quarto::quarto_render(here("R", "01_load.qmd"))gene_enrichment_analysis
This script will run all our Analysis, from downloading the data until producing the plots for the presentation, and render the scripts. The resulting .html files can be found in results
Load libraries
Load all necessary Data
This script downloads the analysed dataset and unpacks it.
Load Libraries
library(here)Download raw data
folder <- here("data", "_raw")
local_path <- file.path(folder, "raw_data.tar.gz")if (!dir.exists(folder)) {
dir.create(folder, recursive = TRUE)
}
download.file("https://cbioportal-datahub.s3.amazonaws.com/tmb_mskcc_2018.tar.gz", local_path, mode = "wb")Unpack raw data
untar(
tarfile = local_path,
exdir = folder
)Cleaning and augmentation of Datasets
In these scripts, the separate data files are analysed for necessary cleaning and interesting patterns or correlations.
quarto::quarto_render(here("R", "02_clean.qmd"))
quarto::quarto_render(here("R", "03_augment.qmd"))Cleaning Script
Load Libraries
library(here)
library(tidyverse)
library(ggplot2)
library(readr)Load Data
folder <- here("data", "_raw")mutation <- read_delim(file.path(folder, "tmb_mskcc_2018/data_mutations.txt"), delim = "\t", col_names = TRUE)Rows: 20563 Columns: 120
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (24): Hugo_Symbol, Center, NCBI_Build, Chromosome, Strand, Consequence, ...
dbl (9): Entrez_Gene_Id, Start_Position, End_Position, t_ref_count, t_alt_c...
lgl (87): dbSNP_Val_Status, Matched_Norm_Sample_Barcode, Match_Norm_Seq_Alle...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
raw_data_sv <- read_delim(file.path(folder, "tmb_mskcc_2018/data_sv.txt"), delim = "\t")Rows: 346 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): Sample_Id, SV_Status, Site1_Hugo_Symbol, Site1_Region, Site1_Chrom...
dbl (11): Site1_Region_Number, Site1_Position, Site2_Region_Number, Site2_Po...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data cleaning of mutations
NA handling
We begin by looking at all the columns with all nas in it.
all_na_columns <- mutation |>
summarise(across(.cols = everything(),
.fns = ~ all(is.na(.x)))) |>
pivot_longer(cols = everything(),
names_to = "column_name",
values_to = "is_all_na") |>
filter(is_all_na == TRUE) |>
pull(column_name)
print("Columns containing only NA values:")[1] "Columns containing only NA values:"
print(all_na_columns) [1] "dbSNP_Val_Status" "Matched_Norm_Sample_Barcode"
[3] "Match_Norm_Seq_Allele1" "Match_Norm_Seq_Allele2"
[5] "Tumor_Validation_Allele1" "Tumor_Validation_Allele2"
[7] "Match_Norm_Validation_Allele1" "Match_Norm_Validation_Allele2"
[9] "Verification_Status" "Sequencing_Phase"
[11] "Sequence_Source" "Validation_Method"
[13] "BAM_File" "Sequencer"
[15] "AA_MAF" "AFR_MAF"
[17] "AMR_MAF" "ASN_MAF"
[19] "Allele" "Amino_Acid_Change"
[21] "Amino_acids" "BIOTYPE"
[23] "CANONICAL" "CCDS"
[25] "CDS_position" "CLIN_SIG"
[27] "COMMENTS" "Comments"
[29] "DISTANCE" "DOMAINS"
[31] "EAS_MAF" "EA_MAF"
[33] "ENSP" "EUR_MAF"
[35] "ExAC_AF" "ExAC_AF_AFR"
[37] "ExAC_AF_AMR" "ExAC_AF_EAS"
[39] "ExAC_AF_FIN" "ExAC_AF_NFE"
[41] "ExAC_AF_OTH" "ExAC_AF_SAS"
[43] "Existing_variation" "FILTER"
[45] "Feature" "Feature_type"
[47] "GENE_PHENO" "GMAF"
[49] "Gene" "HGNC_ID"
[51] "HGVS_OFFSET" "HIGH_INF_POS"
[53] "IMPACT" "MA:FIS"
[55] "MA:FImpact" "MA:link.MSA"
[57] "MA:link.PDB" "MA:link.var"
[59] "MA:protein.change" "MINIMISED"
[61] "MOTIF_NAME" "MOTIF_POS"
[63] "MOTIF_SCORE_CHANGE" "PHENO"
[65] "PICK" "PUBMED"
[67] "PolyPhen" "SAS_MAF"
[69] "SIFT" "SOMATIC"
[71] "SWISSPROT" "SYMBOL"
[73] "SYMBOL_SOURCE" "TREMBL"
[75] "TSL" "Transcript"
[77] "UNIPARC" "VARIANT_CLASS"
[79] "all_effects" "amino_acid_change"
[81] "cDNA_Change" "cDNA_position"
[83] "cdna_change" "comments"
[85] "n_depth" "t_depth"
[87] "transcript"
sprintf("There are %i empty columns.",length(all_na_columns))[1] "There are 87 empty columns."
We drop all these columns:
data_mutation_without_nas <- mutation |>
select(-all_of(all_na_columns))
# Verify the result by comparing the number of columns
print(paste("Original columns:", ncol(mutation)))[1] "Original columns: 120"
print(paste("New columns:", ncol(data_mutation_without_nas)))[1] "New columns: 33"
The new data set looks like this
data_mutation_without_nas |>
slice_sample(n=5)# A tibble: 5 × 33
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 KMT2C 58508 MSKCC GRCh37 7 151945099
2 TP53 7157 MSKCC GRCh37 17 7577539
3 MAPK3 5595 MSKCC GRCh37 16 30128558
4 SMAD4 4089 MSKCC GRCh37 18 48584802
5 RAD50 10111 MSKCC GRCh37 5 131911521
# ℹ 27 more variables: End_Position <dbl>, Strand <chr>, Consequence <chr>,
# Variant_Classification <chr>, Variant_Type <chr>, Reference_Allele <chr>,
# Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>, dbSNP_RS <chr>,
# Tumor_Sample_Barcode <chr>, Validation_Status <chr>, Mutation_Status <chr>,
# Score <chr>, t_ref_count <dbl>, t_alt_count <dbl>, n_ref_count <dbl>,
# n_alt_count <dbl>, HGVSc <chr>, HGVSp <chr>, HGVSp_Short <chr>,
# Transcript_ID <chr>, RefSeq <chr>, Protein_position <dbl>, Codons <chr>, …
print("The resulting columns are:")[1] "The resulting columns are:"
colnames(data_mutation_without_nas) [1] "Hugo_Symbol" "Entrez_Gene_Id" "Center"
[4] "NCBI_Build" "Chromosome" "Start_Position"
[7] "End_Position" "Strand" "Consequence"
[10] "Variant_Classification" "Variant_Type" "Reference_Allele"
[13] "Tumor_Seq_Allele1" "Tumor_Seq_Allele2" "dbSNP_RS"
[16] "Tumor_Sample_Barcode" "Validation_Status" "Mutation_Status"
[19] "Score" "t_ref_count" "t_alt_count"
[22] "n_ref_count" "n_alt_count" "HGVSc"
[25] "HGVSp" "HGVSp_Short" "Transcript_ID"
[28] "RefSeq" "Protein_position" "Codons"
[31] "Hotspot" "ALLELE_NUM" "IS_NEW"
However, there are still columns with some NAs in it -> what to do about it?
na_containing_columns <- data_mutation_without_nas |>
summarise(across(
everything(),
.fns = ~ sum(is.na(.x))
)) |>
pivot_longer(
everything(),
names_to = "column_name",
values_to = "na_count"
) |>
filter(na_count > 0)
na_containing_columns# A tibble: 10 × 2
column_name na_count
<chr> <int>
1 Tumor_Seq_Allele1 2
2 dbSNP_RS 20026
3 HGVSc 518
4 HGVSp 1243
5 HGVSp_Short 518
6 RefSeq 1503
7 Protein_position 531
8 Codons 1239
9 ALLELE_NUM 14176
10 IS_NEW 14289
We see that dbSNP_RS is almost only NAs (20026 over 20563), thus we can drop this column. Apart from that, the only variables that have a significant number of NAs are ALLELE_NUM ans IS_NEW. It seems that ALLELE_NUM is an internal indicator, so we wouldn”t use it, therefor we”ll also drop it.
data_mutation_without_nas <- data_mutation_without_nas |>
select(-c("dbSNP_RS", "ALLELE_NUM"))For IS_NEW, we”ll check if NA corresponds to a not-new allele, in this case we would just infere the NAs to be null or 0.
is_new_values <- data_mutation_without_nas |>
summarize(unique_is_new = unique(IS_NEW))Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
print("The values in the IS_NEW column are:")[1] "The values in the IS_NEW column are:"
is_new_values # A tibble: 2 × 1
unique_is_new
<chr>
1 <NA>
2 NEWRECORD
So indeed the NAs here means that it”s not new. So we will replace NAs by NOTNEW, and then input NOTNEW by 0 and NEWRECORD by 1 (hotspot encoding is easier to use in code).
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(IS_NEW = case_when(
is.na(IS_NEW) ~ 0,
IS_NEW == "NEWRECORD" ~ 1
))For the rest of the column with NAs, we will keep all their rows and just using na.rm = TRUE when using the columns (we can do inference bc they”re not statistical or calculated variable).
We will search all columns with constant value to also drop them as we already know their values.
is_single_value <- function(x) {
n_distinct(x) == 1
}
single_value_columns <- data_mutation_without_nas |>
summarise(across(everything(), is_single_value)) |>
pivot_longer(everything(), names_to = "column_name", values_to = "is_single_value") |>
filter(is_single_value == TRUE) |>
pull(column_name)
single_value_columns[1] "Center" "NCBI_Build" "Strand"
[4] "Validation_Status" "Score" "Hotspot"
So we can drop all these columns knowing their constant values.
data_mutation_without_nas <- data_mutation_without_nas |>
select(-c("Center","NCBI_Build","Strand","Validation_Status","Hotspot","Score"))is_factor <- function(x){
is_char <- is.character(x)
bool <- n_distinct(x)<=40
return(is_char & bool)
}
column_to_factor <- data_mutation_without_nas |>
select(where(is_factor)) |>
names()
print("The column to factor are:")[1] "The column to factor are:"
column_to_factor[1] "Chromosome" "Consequence" "Variant_Classification"
[4] "Variant_Type" "Mutation_Status"
Lot of NAs in ALLELE_NUM and IS_NEW, how to deal with?
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(across(all_of(column_to_factor), factor))We see that for some rows there are several consequences -> we split them into several columns. First lets check how many na is there in it.
consequence_na <- data_mutation_without_nas |>
summarise(consequence_na = sum(is.na(Consequence))) |>
pull(consequence_na)
sprintf("There is %i nas in the consequence column.", as.integer(consequence_na))[1] "There is 0 nas in the consequence column."
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(n_consequences = str_count(Consequence,",")+1)max_consequences <- data_mutation_without_nas$n_consequences |>
max()
sprintf("The max number of consequences for one row is %.4f",max_consequences)[1] "The max number of consequences for one row is 4.0000"
data_mutation_without_nas_wide <- data_mutation_without_nas |>
separate(col = "Consequence",
into = str_c("consequence_", 1:max_consequences),
sep = ",",
remove = TRUE)Warning: Expected 4 pieces. Missing pieces filled with `NA` in 20560 rows [1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
data_mutation_without_nas_long <- data_mutation_without_nas_wide |>
pivot_longer(
cols = contains("consequence_"),
names_to = NULL,
values_to = "consequence"
) |>
drop_na("consequence") |>
mutate(
consequence = as.factor(consequence),
patient_id = str_extract(Tumor_Sample_Barcode, "^P-\\d{7}")
) |>
relocate(c(patient_id,Tumor_Sample_Barcode),
.before = Entrez_Gene_Id)
data_mutation_without_nas_long |>
sample_n(5)# A tibble: 5 × 27
Hugo_Symbol patient_id Tumor_Sample_Barcode Entrez_Gene_Id Chromosome
<chr> <chr> <chr> <dbl> <fct>
1 PIK3CA P-0007916 P-0007916-T01-IM5 5290 3
2 SETD2 P-0009080 P-0009080-T01-IM5 29072 3
3 HGF P-0007083 P-0007083-T02-IM6 3082 7
4 DNMT1 P-0009706 P-0009706-T01-IM5 1786 19
5 CDKN2A P-0008912 P-0008912-T01-IM5 1029 9
# ℹ 22 more variables: Start_Position <dbl>, End_Position <dbl>,
# Variant_Classification <fct>, Variant_Type <fct>, Reference_Allele <chr>,
# Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>, Mutation_Status <fct>,
# t_ref_count <dbl>, t_alt_count <dbl>, n_ref_count <dbl>, n_alt_count <dbl>,
# HGVSc <chr>, HGVSp <chr>, HGVSp_Short <chr>, Transcript_ID <chr>,
# RefSeq <chr>, Protein_position <dbl>, Codons <chr>, IS_NEW <dbl>,
# n_consequences <dbl>, consequence <fct>
Export the cleaned data
if(!dir.exists(here("data"))) dir.create(here("data"))
data_mutation_without_nas_long |>
write_tsv(file = here("data","data_mutation_cleaned.tsv"))Structural Variants Cleaning and exporting
Discarding two columns that had a constant value: SOMATIC for SV_Status and GRCh37 for NCBI_Build.
cleaned_sv <- raw_data_sv |>
select(-SV_Status, -NCBI_Build)The file”s dimensions are inconsistent with the other files, and the Sample_ID column is not a unique identifier, so it cannot serve as a primary key for subsequent joins. We will export this cleaned file and handle the various merging operations later in the analysis, ensuring the appropriate joining method is applied based on the expected output.
cleaned_sv |>
write_tsv(file = here("data","sv_cleaned.tsv"))Augmentation Script
Load Libraries
library(here)
library(tidyverse)
library(readr)Load Data
folder <- here("data", "_raw")clinical_sample <- read_delim(file.path(folder, "tmb_mskcc_2018/data_clinical_sample.txt"), delim = "\t", skip = 4, col_names = TRUE)Rows: 1661 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (13): PATIENT_ID, SAMPLE_ID, CANCER_TYPE, SAMPLE_TYPE, SAMPLE_CLASS, MET...
dbl (3): SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT, TMB_NONSYNONYMOUS
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_patient <- read_delim(file.path(folder, "tmb_mskcc_2018/data_clinical_patient.txt"), delim = "\t", skip = 4, col_names = TRUE)Rows: 1661 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): PATIENT_ID, SEX, OS_STATUS, AGE_GROUP, DRUG_TYPE
dbl (1): OS_MONTHS
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_panel_matrix <- read_delim(file.path(folder, "tmb_mskcc_2018/data_gene_panel_matrix.txt"), delim = "\t")Rows: 1661 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): SAMPLE_ID, mutations, structural_variants
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Patients Cleaning
clinical_patient <- clinical_patient |>
mutate(
OS_STATUS = case_when(
OS_STATUS == "0:LIVING" ~ 0,
OS_STATUS == "1:DECEASED" ~ 1
),
SEX_NUM = case_when(
SEX == "Male" ~ 0,
SEX == "Female" ~ 1
),
AGE_GROUP = factor(
AGE_GROUP,
levels = c("<30", "31-50", "50-60", "61-70", ">71"),
ordered = TRUE
)
)Cleaning Gene-Panel Matrix
gene_panel_matrix <- gene_panel_matrix |>
mutate(IMPACT_version = case_when(
mutations == "IMPACT341" ~ "IMPACT_v3",
mutations == "IMPACT410" ~ "IMPACT_v5",
mutations == "IMPACT468" ~ "IMPACT_v6"
)
)Merging Patient-Sample
patients_sample <- left_join(clinical_patient, clinical_sample, by = "PATIENT_ID")Merging Patient-Sample with Gene panel Matrix
patients_sample_gene_panel <- left_join(patients_sample, gene_panel_matrix, by = "SAMPLE_ID")Exporting the cleaned merged data
patients_sample_gene_panel|>
write_tsv(file = here("data","patient_sample_gene_panel.tsv"))Descriptive Scripts
These script explore all 5 Dataset.
quarto::quarto_render(here("R", "04_1_data_clinical_patient_analysis.qmd"))
quarto::quarto_render(here("R", "04_2_data_clinical_sample_analysis.qmd"))
quarto::quarto_render(here("R", "04_3_data_mutations_analysis.qmd"))
quarto::quarto_render(here("R", "04_4_gene_panel_mutation.qmd"))
quarto::quarto_render(here("R", "04_5_sv_class_analysis.qmd"))Patient Dataset
Importing libraries
Loading required package: ggpubr
Attaching package: 'survminer'
The following object is masked from 'package:survival':
myeloma
Loading required package: viridisLite
Importing data
patients_sample_gene_panel <- read_tsv(here("data", "patient_sample_gene_panel.tsv"))Subsetting Data
From the merge data set, we will do analysis on the patients information for now, so taking only the columns that were originally in the patient data set, diregarding the cancer types,..
patients <- patients_sample_gene_panel |>
select(c(SEX, OS_MONTHS, OS_STATUS, AGE_GROUP, DRUG_TYPE)) |>
mutate(AGE_GROUP = factor(
AGE_GROUP,
levels = c("<30", "31-50", "50-60", "61-70", ">71"),
ordered = TRUE
)
)Plots
First, we will be looking at the age distribution of Cancers in men and woman and separating the patients that died from the living ones.
patients |>
ggplot(aes(x = AGE_GROUP,
fill = factor(SEX))) +
geom_bar()+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7)+
facet_wrap(
~OS_STATUS,
labeller = labeller(OS_STATUS =
c(`0` = "Alive",
`1` = "Deceased")
))+
labs(title="Proportion of Cancer diagnosed in Men and Woman at different ages and their current status",
x = "Age group at Diagnosis (years)",
fill = 'Gender')+
theme_minimal()+
theme(plot.title = element_text(size = 9))In both groups, dead or alive, the patients, men and woman that have a cancer diagnosis are usually between 61-70 years old. Cancers seems to be more diagnosed in man than in woman, at all ages.
Looking more into this age group (61-70), lets consider the drug given against cancers. There is also a comparison of the drug given at all ages, for reference. In both cases, male and female are separated (medications do not affect men and women in the same way).
patients |>
filter(AGE_GROUP == "61-70") |>
ggplot(mapping = aes(x = DRUG_TYPE,
fill = SEX))+
geom_bar()+
labs(title="Drug given to patients diagnosed with a cancer between 61-70years old, differenciated by male and woman",
x = "Drug",
fill = 'Gender')+
theme(plot.title = element_text(size = 10))+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7)#Comparing it to Patients of All Ages
ggplot(patients,
mapping = aes(x = DRUG_TYPE,
fill = SEX))+
geom_bar()+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7)+
labs(title="Drug given to patients, of all ages, differenciated by gender",
x = "Drug",
fill = 'Gender')+
theme_minimal()+
theme(plot.title = element_text(size = 10))Looking at the 61-70 years old again, We want to see the time these patients survived for depending on the drug they were given. Again man and Woman are separated.
patients |>
filter(AGE_GROUP == "61-70") |>
ggplot(mapping = aes(x = SEX,
y = OS_MONTHS,
fill = DRUG_TYPE))+
geom_boxplot()+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7)+
labs(title = 'Box Plot of the Drug success rate based on the months the patients lived, separated by gender',
x = 'Gender',
y = 'Time survived (in months)',
fill = 'Medication')+
theme_minimal()+
theme(plot.title = element_text(size = 10))Here patients that had lived and those that are dead are mixed, could be see if there is a distinction, between the number of patients dead or alive, based on the same components of the previous plot.
patients |>
filter(AGE_GROUP == "61-70") |>
ggplot(mapping = aes(x = DRUG_TYPE,
y = OS_MONTHS,
fill = factor(OS_STATUS)))+
geom_boxplot()+
scale_fill_viridis(label = c('Alive', "Desceased"),
option = "viridis",
discrete = TRUE,
alpha = 0.7)+
facet_wrap(~ SEX)+
labs(title = 'Box Plot of the Drug success rate based on the proportion of patients status, separated by gender',
fill = 'Status',
x = 'Drug Type against Cancer',
y = 'Number of months survived')+
theme_minimal()+
theme(plot.title = element_text(size = 10))In General, PDL-1 is given more than other drugs, CTL4 is rarely given and sometimes Combo.
As for the 61-70 age group, woman given CTLA4, both survived and died the most.
For both men and women PDL-1 has the lowest death proportion. It could be that PDL-1 is generally more given, in cancers that are less aggressive, and combo or CTLA4 for more aggressive cancers that do not react to PDL-1.
Survival rate
With the patient data, containing Overall Survival months (OS_MONTHS) and Overall Survival Status (OS_STATUS), it is possible to compute a survival probability over time, using the Kaplan Meier method. The Survival package follows this principal and allows us to compute the survival rate of the patients. To do so, Surv() creates the formula object, associating the status of a patients to it’s survival time. Then, survfit() uses the formula object to fit it to the Kaplan Meier estimator, calculating the survival probabilities at each observed event time. The result can then be used to plot the Kaplan–Meier survival.
#Using the Kaplan Meier based survival function:
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~1, data = patients)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
lower = KM_f$lower,
upper = KM_f$upper)
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival))+
geom_step()+
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, color = 'blue') +
coord_cartesian(ylim = c(0,NA))+
labs(title = 'Overall Survival rate of the patients',
x = "Time (in months)",
y = 'Overall Survival Probability')+
theme_minimal()From the survival analysis, there is a rapid decline in the survival probability in the first months (20-30).
It would be interesting to look at the impact of the different variables between the patients, mainly if the age has an impact on survival or if the gender could also impact the survival probability. From this data set, we also have the drug that was given to each patients, we can see if a drug is more successful by looking at its survival probability.
By Gender:
#Using the Kaplan Meier based survival function, declining it to the different sobservations that we can make
#survival rate based on the gender of the patients:
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~SEX, data = patients)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata),
lower = KM_f$lower,
upper = KM_f$upper)
tibble_KM_f <- tibble_KM_f |>
mutate(strata = str_remove(strata, 'SEX ='))
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(label = c('Female', 'Male'),
option = "viridis",
alpha = 1)+
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their gender',
x ='Time (in Months)',
y = 'Overall Survival Probability',
color = 'Gender')+
theme_minimal()Overall, the gender has a minimal impact on the survival probability.
By Age group:
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~AGE_GROUP, data = patients)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "AGE_GROUP="),
strata = factor(strata,
levels = c("<30", "31-50", "50-60", "61-70", ">71"),
ordered = TRUE
)
)
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(option = "viridis",
alpha = 1)+
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their Age at the dignostic',
x = 'Time (in Months)',
y = 'Overall Survival Probability',
color = 'Age Group')+
theme_minimal()By Drug type
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~DRUG_TYPE, data = patients)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "DRUG_TYPE="))
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(label = c('CTLA4', 'PD-1/PDL-1', 'Combo'))+
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their Medication',
x ='Time (in Months)',
y = 'Overall Survival Probability',
color = 'Drug')+
theme_minimal()Overall, the survival probability increases after 40 months as the slope of the curve is less steep. It is possible that in some cases only a few patients are left then, and were not reported further.
Sample Dataset
Load Packages
library(readr)
library(here)
library(tidyverse)
library(igraph)
library(ggraph)
library(viridis)
library(ggplot2)Define equal plot colors:
theme_set(theme_minimal())
scale_fill_discrete <- function(...) scale_fill_viridis_d(...)
scale_color_discrete <- function(...) scale_color_viridis_d(...)Exploratory Analysis
Load raw data:
Investigate the raw data, look for necessary cleaning
clinical_sample <- read_delim(here("data/_raw/tmb_mskcc_2018/data_clinical_sample.txt"), delim = "\t", skip = 4, col_names = TRUE)Categorical Variables:
clinical_sample |> select(CANCER_TYPE) |> unique()# A tibble: 11 × 1
CANCER_TYPE
<chr>
1 Breast Cancer
2 Esophagogastric Cancer
3 Bladder Cancer
4 Non-Small Cell Lung Cancer
5 Glioma
6 Head and Neck Cancer
7 Cancer of Unknown Primary
8 Renal Cell Carcinoma
9 Melanoma
10 Colorectal Cancer
11 Skin Cancer, Non-Melanoma
clinical_sample |> select(SAMPLE_TYPE) |> unique()# A tibble: 2 × 1
SAMPLE_TYPE
<chr>
1 Primary
2 Metastasis
clinical_sample |> select(SAMPLE_CLASS) |> unique()# A tibble: 1 × 1
SAMPLE_CLASS
<chr>
1 Tumor
clinical_sample |> select(METASTATIC_SITE) |> unique()# A tibble: 79 × 1
METASTATIC_SITE
<chr>
1 Not Applicable
2 Lymph Node
3 Liver
4 Lung
5 Pleura
6 Ovary
7 Abdomen
8 Bone
9 Lymph Node, Non-Regional
10 Heart
# ℹ 69 more rows
clinical_sample |> select(PRIMARY_SITE) |> unique()# A tibble: 69 × 1
PRIMARY_SITE
<chr>
1 Breast
2 Esophagus
3 Bladder
4 Lung
5 Stomach
6 Urethra
7 Brain
8 Maxilla
9 Unknown
10 Kidney
# ℹ 59 more rows
clinical_sample |> select(GENE_PANEL) |> unique()# A tibble: 3 × 1
GENE_PANEL
<chr>
1 IMPACT341
2 IMPACT410
3 IMPACT468
Are patient ids and sample ids unique here? Yes. We have one sample per patient.
length(unique(clinical_sample$PATIENT_ID))[1] 1661
length(unique(clinical_sample$SAMPLE_ID))[1] 1661
Number Metastatic vs Primary
clinical_sample |>
group_by(SAMPLE_TYPE)|>
summarise(n= n())# A tibble: 2 × 2
SAMPLE_TYPE n
<chr> <int>
1 Metastasis 930
2 Primary 731
Age distribution:
ggplot(clinical_sample, aes(AGE_AT_SEQ_REPORT))+
geom_bar()+
labs(title ="Age distribution")Warning: Removed 1 row containing non-finite outside the scale range
(`stat_count()`).
Investigate the warning: One Sample without recorded Age and TMB
clinical_sample |> filter(is.na(AGE_AT_SEQ_REPORT))# A tibble: 1 × 16
PATIENT_ID SAMPLE_ID CANCER_TYPE SAMPLE_TYPE SAMPLE_CLASS METASTATIC_SITE
<chr> <chr> <chr> <chr> <chr> <chr>
1 P-0010842 P-0010842-T01… Non-Small … Primary Tumor Not Applicable
# ℹ 10 more variables: PRIMARY_SITE <chr>, CANCER_TYPE_DETAILED <chr>,
# GENE_PANEL <chr>, SAMPLE_COVERAGE <dbl>, TUMOR_PURITY <chr>,
# ONCOTREE_CODE <chr>, INSTITUTE <chr>, SOMATIC_STATUS <chr>,
# AGE_AT_SEQ_REPORT <dbl>, TMB_NONSYNONYMOUS <dbl>
TMB Distribution:
Many samples with low TMB
ggplot(clinical_sample, aes(TMB_NONSYNONYMOUS))+
geom_density()+
labs(title = "TMB Distribution")Investigate high samples with high TMB
clinical_sample |>
select(PATIENT_ID, SAMPLE_ID, TMB_NONSYNONYMOUS)|>
arrange(desc(TMB_NONSYNONYMOUS)) |>
head()# A tibble: 6 × 3
PATIENT_ID SAMPLE_ID TMB_NONSYNONYMOUS
<chr> <chr> <dbl>
1 P-0005021 P-0005021-T01-IM5 207.
2 P-0011357 P-0011357-T01-IM5 200.
3 P-0000686 P-0000686-T01-IM3 180.
4 P-0001808 P-0001808-T01-IM3 176.
5 P-0013826 P-0013826-T01-IM5 162.
6 P-0010499 P-0010499-T01-IM5 152.
The dataset is quite clean. The sample without recorded age might cause issue when using Age or TMB as a parameter in other analysis, but since its other values are recorded we decided against a complete removal of the sample. In analysis with Age or TMB, especially if the method cannot handle zero or NA values, this sample needs to be removed prior.
Exploratory Analysis on merged Dataset
Load merged dataset with Patient information
patient_sample <- read_tsv(file = here("data","patient_sample_gene_panel.tsv"))Previous findings
Look at patient without recorded age. Here it is recorded at >71
patient_sample |> filter(is.na(AGE_AT_SEQ_REPORT))# A tibble: 1 × 25
PATIENT_ID SEX OS_MONTHS OS_STATUS AGE_GROUP DRUG_TYPE SEX_NUM SAMPLE_ID
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr>
1 P-0010842 Male 0 1 >71 PD-1/PDL-1 0 P-0010842-T…
# ℹ 17 more variables: CANCER_TYPE <chr>, SAMPLE_TYPE <chr>,
# SAMPLE_CLASS <chr>, METASTATIC_SITE <chr>, PRIMARY_SITE <chr>,
# CANCER_TYPE_DETAILED <chr>, GENE_PANEL <chr>, SAMPLE_COVERAGE <dbl>,
# TUMOR_PURITY <chr>, ONCOTREE_CODE <chr>, INSTITUTE <chr>,
# SOMATIC_STATUS <chr>, AGE_AT_SEQ_REPORT <dbl>, TMB_NONSYNONYMOUS <dbl>,
# mutations <chr>, structural_variants <chr>, IMPACT_version <chr>
Number of tumors per organ
Count how many samples were taken per organ.
patient_sample |>
mutate(sample_site = ifelse(METASTATIC_SITE == "Not Applicable", PRIMARY_SITE, METASTATIC_SITE))|>
ggplot( aes(sample_site))+
geom_bar()+
labs(
title = "Number of Samples per Organ"
)+
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1, size = 4))There are many organs which only have very few samples.
Filter for organs with > 20 tumors sampled.
patient_sample <- patient_sample |>
mutate(sample_site = ifelse(METASTATIC_SITE == "Not Applicable", PRIMARY_SITE, METASTATIC_SITE))
common_tumor_sites <- patient_sample|>
group_by(sample_site)|>
summarise(n = n())|>
filter(n>20)
patient_sample|>
filter(sample_site %in% common_tumor_sites$sample_site)|>
ggplot(aes(sample_site))+
geom_bar()+
labs(
title = "Number of Samples for Organs with more than 20 Samples"
)+
theme(axis.text.x = element_text(angle = 90, hjust=1, size = 10))Network of Metastasis
Create Network of where primary tumors have spread to as Metastasis.
nodes <- patient_sample |>
select(PRIMARY_SITE, METASTATIC_SITE)|>
mutate(to = ifelse(METASTATIC_SITE == "Not Applicable", NA, METASTATIC_SITE))|>
rename(from = PRIMARY_SITE) |>
select(-METASTATIC_SITE)|>
na.omit(to)|>
select(-to)|>
unique()
network_inf <- patient_sample |>
select(PRIMARY_SITE, METASTATIC_SITE)|>
mutate(to = ifelse(METASTATIC_SITE == "Not Applicable", NA, METASTATIC_SITE))|>
rename(from = PRIMARY_SITE) |>
select(-METASTATIC_SITE)|>
na.omit(to)|>
group_by(from, to)|>
summarise(n = n())`summarise()` has grouped output by 'from'. You can override using the
`.groups` argument.
node_labels <-c(unique(network_inf$from), unique(network_inf$to))
node_labels <- unique(node_labels)
g <- graph_from_data_frame(network_inf, directed = TRUE)
ggraph(g , layout = "stress")+
geom_edge_link(aes(alpha = n), arrow = arrow(length = unit(2.5, "mm")))+#
geom_node_point()+
geom_node_text(aes(label=node_labels), repel = TRUE, size = 3)+
labs(title = "Network of Primary Tumor sites connected to their Metastatic sites")Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
This is a highly connected Network. Some hubs can be seen and as represented with the strength of the arrow, it is visible that some primary tumor sides have common metastatic sites.
To get a better overview, the network is subsampled to primary sites which are common(>20 samples) and their metastatic sites.
network_common_tumor_sites <- network_inf|>
filter(from %in% common_tumor_sites$sample_site)
node_labels <- network_common_tumor_sites|>
select(-n)|>
pivot_longer(cols = c(from, to), values_to = "vertice")|>
select(-name)|>
distinct(vertice)|>
mutate(common_site = ifelse(vertice %in% common_tumor_sites$sample_site, TRUE, FALSE))
g_common <- graph_from_data_frame(network_common_tumor_sites)
ggraph(g_common , layout = "stress")+
geom_edge_link(aes(alpha = n), arrow = arrow(length = unit(2.5, "mm")))+
geom_node_point(aes(color = node_labels$common_site))+
geom_node_text(aes(label=node_labels$vertice), repel = TRUE, size = 3)+
labs(title = "Subsetted network for primary sites with over 20 Samples and their metastatic sites")This is still a big Network. The common tumor sites are only 16 nodes, but they have multiple metastatic sites. Some common tumor sites are also connected, so spread to each other.
The most common primary-metastatic sites are :
network_inf |>
arrange(desc(n)) |>
head(n = 10)# A tibble: 10 × 3
# Groups: from [4]
from to n
<chr> <chr> <int>
1 Lung Lymph Node 53
2 Skin Lymph Node, Regional 47
3 Lung Liver 32
4 Kidney Lung 24
5 Skin Lung 23
6 Bladder Lymph Node 21
7 Skin In-transit 21
8 Lung Pleura 17
9 Lung Brain 16
10 Skin Soft Tissue 16
Exploratory Analysis Glioma
Subset the dataset to only Glioma Samples
sample_glioma <- patient_sample |>
filter(CANCER_TYPE=="Glioma")nrow(sample_glioma)[1] 117
There are 117 Glioma samples.
sample_glioma |> group_by(SAMPLE_TYPE)|> summarise(n= n())# A tibble: 2 × 2
SAMPLE_TYPE n
<chr> <int>
1 Metastasis 2
2 Primary 115
The sampled gliomas are mainly primary tumors. This is to be expected since Gliomas rarely form Metastasis outside of the central nervous system[DOI: 10.1200/JCO.2013.48.7546].
Exploratory Plots
Some statistics plotted against the full dataset.
ggplot() +
geom_density(data = patient_sample,
aes(OS_MONTHS,
fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma,
aes(OS_MONTHS,
fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "Months of Survival",
title = "Survival Months per Dataset"
)ggplot() +
geom_bar(data = patient_sample,
aes(SEX, after_stat(count / sum(count)), fill = "Full"),
alpha = 0.5)+
geom_bar(data = sample_glioma,
aes(SEX, after_stat(count / sum(count)), fill = "Glioma"),
alpha = 0.5)+
labs(
y = "Proportion",
fill = "Dataset",
x = "Gender",
title = "Gender Distribution per Dataset"
)ggplot() +
geom_bar(data = patient_sample,
aes(OS_STATUS, after_stat(count / sum(count)), fill = "Full"),
alpha = 0.5)+
geom_bar(data = sample_glioma,
aes(OS_STATUS, after_stat(count / sum(count)), fill = "Glioma"),
alpha = 0.5)+
labs(
y = "Proportion",
fill = "Dataset",
x = "Survival Status",
title = "Survival Distribution per Dataset"
)ggplot() +
geom_bar(data = patient_sample,
aes(DRUG_TYPE, after_stat(count / sum(count)), fill = "Full"),
alpha = 0.5)+
geom_bar(data = sample_glioma,
aes(DRUG_TYPE, after_stat(count / sum(count)), fill = "Glioma"),
alpha = 0.5)+
labs(
y = "Proportion",
fill = "Dataset",
x = "Drug Type",
title = "Drug Type Distribution per Dataset"
)ggplot() +
geom_density(data = patient_sample|> filter(OS_STATUS == 1),
aes(OS_MONTHS, fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma |> filter(OS_STATUS == 1),
aes(OS_MONTHS, fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "Survival Months",
title = "Survival Distribution for diseased patients per Dataset"
)ggplot() +
geom_density(data = patient_sample|> filter(OS_STATUS == 0),
aes(OS_MONTHS, fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma |> filter(OS_STATUS == 0),
aes(OS_MONTHS, fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "Survival Months",
title = "Survival Distribution for alive patients per Dataset"
)ggplot() +
geom_density(data = patient_sample,
aes(SAMPLE_COVERAGE, fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma,
aes(SAMPLE_COVERAGE, fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "Sample Coverage",
title = "Sample Coverage Distribution per Dataset"
)ggplot() +
geom_density(data = patient_sample,
aes(as.numeric(TUMOR_PURITY), fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma,
aes(as.numeric(TUMOR_PURITY), fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "Tumor Purity",
title = "Tumor Purity Distribution per Dataset"
)Warning in FUN(X[[i]], ...): NAs introduced by coercion
Warning in FUN(X[[i]], ...): NAs introduced by coercion
Warning: Removed 65 rows containing non-finite outside the scale range
(`stat_density()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_density()`).
ggplot() +
geom_density(data = patient_sample,
aes(TMB_NONSYNONYMOUS, fill = "Full"),
alpha = 0.5)+
geom_density(data = sample_glioma,
aes(TMB_NONSYNONYMOUS, fill = "Glioma"),
alpha = 0.5)+
labs(
fill = "Dataset",
x = "TMB",
title = "TMB Distribution per Dataset"
)For some Variables the Subsetted dataset for Glioma samples is close to the one of the whole Dataset. It is noticable though that
a higher number of patients die
the tumor Purity of Gliomas is very high
the TMB is lower in comparison to the whole dataset
Look into the low TMB
sample_glioma |>
select(PATIENT_ID, SAMPLE_ID, TMB_NONSYNONYMOUS)|>
arrange(desc(TMB_NONSYNONYMOUS)) |>
head(n = 10)# A tibble: 10 × 3
PATIENT_ID SAMPLE_ID TMB_NONSYNONYMOUS
<chr> <chr> <dbl>
1 P-0010744 P-0010744-T01-IM5 98.9
2 P-0009411 P-0009411-T01-IM5 61.7
3 P-0013506 P-0013506-T01-IM5 55.8
4 P-0015573 P-0015573-T01-IM6 51.0
5 P-0009499 P-0009499-T01-IM5 50.9
6 P-0001420 P-0001420-T01-IM3 38.8
7 P-0001882 P-0001882-T01-IM3 34.4
8 P-0013029 P-0013029-T01-IM5 30.3
9 P-0002633 P-0002633-T01-IM3 22.2
10 P-0000749 P-0000749-T01-IM3 8.87
Plot for the more detailed cancer type of gliomas
sample_glioma |>
ggplot( aes(CANCER_TYPE_DETAILED))+
geom_bar()+
labs(title = "Detailed cancer type for glioma tumors")+
theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1, size = 10))Mutation Dataset
Library import
library(here)
library(tidyverse)
library(ggplot2)
library(table1)
Attaching package: 'table1'
The following objects are masked from 'package:base':
units, units<-
Data import
data_mutation <- read_tsv(here("data/_raw/tmb_mskcc_2018", "data_mutations.txt"),
na = c("", "NA", "NULL"),
show_col_types = FALSE)data_mutation |>
slice_sample(n = 5)# A tibble: 5 × 120
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 TP53 7157 MSKCC GRCh37 17 7577094
2 REL 5966 MSKCC GRCh37 2 61118941
3 ZFHX3 463 MSKCC GRCh37 16 72923647
4 TERT 7015 MSKCC GRCh37 5 1295250
5 FGF4 2249 MSKCC GRCh37 11 69589848
# ℹ 114 more variables: End_Position <dbl>, Strand <chr>, Consequence <chr>,
# Variant_Classification <chr>, Variant_Type <chr>, Reference_Allele <chr>,
# Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>, dbSNP_RS <chr>,
# dbSNP_Val_Status <lgl>, Tumor_Sample_Barcode <chr>,
# Matched_Norm_Sample_Barcode <lgl>, Match_Norm_Seq_Allele1 <lgl>,
# Match_Norm_Seq_Allele2 <lgl>, Tumor_Validation_Allele1 <lgl>,
# Tumor_Validation_Allele2 <lgl>, Match_Norm_Validation_Allele1 <lgl>, …
Data cleaning
NA handling
We begin by looking at all the columns with all nas in it.
all_na_columns <- data_mutation |>
summarise(across(.cols = everything(),
.fns = ~ all(is.na(.x)))) |>
pivot_longer(cols = everything(),
names_to = "column_name",
values_to = "is_all_na") |>
filter(is_all_na == TRUE) |>
pull(column_name)
print("Columns containing only NA values:")[1] "Columns containing only NA values:"
print(all_na_columns) [1] "dbSNP_Val_Status" "Matched_Norm_Sample_Barcode"
[3] "Match_Norm_Seq_Allele1" "Match_Norm_Seq_Allele2"
[5] "Tumor_Validation_Allele1" "Tumor_Validation_Allele2"
[7] "Match_Norm_Validation_Allele1" "Match_Norm_Validation_Allele2"
[9] "Verification_Status" "Sequencing_Phase"
[11] "Sequence_Source" "Validation_Method"
[13] "BAM_File" "Sequencer"
[15] "AA_MAF" "AFR_MAF"
[17] "AMR_MAF" "ASN_MAF"
[19] "Allele" "Amino_Acid_Change"
[21] "Amino_acids" "BIOTYPE"
[23] "CANONICAL" "CCDS"
[25] "CDS_position" "CLIN_SIG"
[27] "COMMENTS" "Comments"
[29] "DISTANCE" "DOMAINS"
[31] "EAS_MAF" "EA_MAF"
[33] "ENSP" "EUR_MAF"
[35] "ExAC_AF" "ExAC_AF_AFR"
[37] "ExAC_AF_AMR" "ExAC_AF_EAS"
[39] "ExAC_AF_FIN" "ExAC_AF_NFE"
[41] "ExAC_AF_OTH" "ExAC_AF_SAS"
[43] "Existing_variation" "FILTER"
[45] "Feature" "Feature_type"
[47] "GENE_PHENO" "GMAF"
[49] "Gene" "HGNC_ID"
[51] "HGVS_OFFSET" "HIGH_INF_POS"
[53] "IMPACT" "MA:FIS"
[55] "MA:FImpact" "MA:link.MSA"
[57] "MA:link.PDB" "MA:link.var"
[59] "MA:protein.change" "MINIMISED"
[61] "MOTIF_NAME" "MOTIF_POS"
[63] "MOTIF_SCORE_CHANGE" "PHENO"
[65] "PICK" "PUBMED"
[67] "PolyPhen" "SAS_MAF"
[69] "SIFT" "SOMATIC"
[71] "SWISSPROT" "SYMBOL"
[73] "SYMBOL_SOURCE" "TREMBL"
[75] "TSL" "Transcript"
[77] "UNIPARC" "VARIANT_CLASS"
[79] "all_effects" "amino_acid_change"
[81] "cDNA_Change" "cDNA_position"
[83] "cdna_change" "comments"
[85] "n_depth" "t_depth"
[87] "transcript"
sprintf("There are %i empty columns.",length(all_na_columns))[1] "There are 87 empty columns."
We drop all these columns:
data_mutation_without_nas <- data_mutation |>
select(-all_of(all_na_columns))
# Verify the result by comparing the number of columns
print(paste("Original columns:", ncol(data_mutation)))[1] "Original columns: 120"
print(paste("New columns:", ncol(data_mutation_without_nas)))[1] "New columns: 33"
The new data set looks like this
data_mutation_without_nas |>
slice_sample(n=5)# A tibble: 5 × 33
Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position
<chr> <dbl> <chr> <chr> <chr> <dbl>
1 EPHA7 2045 MSKCC GRCh37 6 94120377
2 PTPRD 5789 MSKCC GRCh37 9 8465608
3 ARID2 196528 MSKCC GRCh37 12 46245965
4 RB1 5925 MSKCC GRCh37 13 48937076
5 RUNX1 861 MSKCC GRCh37 21 36164627
# ℹ 27 more variables: End_Position <dbl>, Strand <chr>, Consequence <chr>,
# Variant_Classification <chr>, Variant_Type <chr>, Reference_Allele <chr>,
# Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>, dbSNP_RS <chr>,
# Tumor_Sample_Barcode <chr>, Validation_Status <chr>, Mutation_Status <chr>,
# Score <chr>, t_ref_count <dbl>, t_alt_count <dbl>, n_ref_count <dbl>,
# n_alt_count <dbl>, HGVSc <chr>, HGVSp <chr>, HGVSp_Short <chr>,
# Transcript_ID <chr>, RefSeq <chr>, Protein_position <dbl>, Codons <chr>, …
print("The resulting columns are:")[1] "The resulting columns are:"
colnames(data_mutation_without_nas) [1] "Hugo_Symbol" "Entrez_Gene_Id" "Center"
[4] "NCBI_Build" "Chromosome" "Start_Position"
[7] "End_Position" "Strand" "Consequence"
[10] "Variant_Classification" "Variant_Type" "Reference_Allele"
[13] "Tumor_Seq_Allele1" "Tumor_Seq_Allele2" "dbSNP_RS"
[16] "Tumor_Sample_Barcode" "Validation_Status" "Mutation_Status"
[19] "Score" "t_ref_count" "t_alt_count"
[22] "n_ref_count" "n_alt_count" "HGVSc"
[25] "HGVSp" "HGVSp_Short" "Transcript_ID"
[28] "RefSeq" "Protein_position" "Codons"
[31] "Hotspot" "ALLELE_NUM" "IS_NEW"
However, there are still columns with some NAs in it -> what to do about it?
na_containing_columns <- data_mutation_without_nas |>
summarise(across(
everything(),
.fns = ~ sum(is.na(.x))
)) |>
pivot_longer(
everything(),
names_to = "column_name",
values_to = "na_count"
) |>
filter(na_count > 0)
na_containing_columns# A tibble: 10 × 2
column_name na_count
<chr> <int>
1 Tumor_Seq_Allele1 2
2 dbSNP_RS 20026
3 HGVSc 518
4 HGVSp 1243
5 HGVSp_Short 518
6 RefSeq 1503
7 Protein_position 531
8 Codons 1239
9 ALLELE_NUM 14176
10 IS_NEW 14289
We see that dbSNP_RS is almost only NAs (20026 over 20563), thus we can drop this column. Apart from that, the only variables that have a significant number of NAs are ALLELE_NUM ans IS_NEW.
data_mutation_without_nas <- data_mutation_without_nas |>
select(-c("dbSNP_RS"))Explanation of the variables
| Column name | What it is |
|---|---|
| Hugo_Symbol | The Name of the gene. |
| Entrez_Gene_Id | The gene’s unique ID number. |
| Chromosome | The specific chromosome where the mutation occurred |
| Start_Position | The first base pair coordinate of the mutation on the chromosome. |
| End_Position | The last base pair coordinate of the mutation. (Often the same as Start_Position for single-base changes). |
| Strand | Indicates whether the gene is read on the positive (+) or negative (-) strand of the DNA double helix. |
| Center | The institution or center that submitted the sequencing data (e.g., MSKCC for Memorial Sloan Kettering Cancer Center). |
| NCBI_Build | The specific version of the human genome reference sequence used for mapping (e.g., GRCh37 is a common older version). |
| Transcript_ID | The ID of the specific mRNA transcript (from the gene) that was analyzed. |
| RefSeq | The reference sequence accession number for the transcript, used for high-quality sequence reporting. |
| Reference_Allele | The normal (wild-type) base(s) found at this position in the reference genome. |
| Tumor_Seq_Allele1 | The first DNA sequence read from the tumor sample. (Often the reference allele, especially if it’s a heterozygous mutation). |
| Tumor_Seq_Allele2 | The second DNA sequence read from the tumor sample, which is often the mutated allele. |
| Variant_Type | Describes the physical nature of the change: SNP (Single Nucleotide Polymorphism - one base changed), DEL (Deletion), INS (Insertion), etc. |
| Variant_Classification | Describes the functional impact of the mutation on the gene’s function: Missense_Mutation (changes one amino acid), Nonsense_Mutation (creates a stop signal), Frame_Shift_Del (shifts the reading frame). |
| Consequence | A technical, detailed description of the mutation’s location and effect (e.g., missense_variant, splice_region_variant). |
| HGVSc | HGVS notation describing the change at the cDNA level (e.g., c.839G>A). (The Code Change in the mRNA message.) |
| HGVSp | HGVS notation describing the change at the protein level (e.g., p.Arg280Lys). This shows which amino acid was changed and what it became. |
| HGVSp_Short | A short version of the protein change (e.g., p.R280K). |
| Protein_position | The position number of the amino acid affected in the protein sequence. |
| Codons | The triplet of DNA bases that codes for the amino acid, showing the change (e.g., Cgg/Tgg). |
| t_ref_count | The number of times the reference allele (normal base) was observed in the tumor sample reads. |
| t_alt_count | The number of times the alternate allele (mutant base) was observed in the tumor sample reads |
| n_ref_count | The number of times the reference allele was observed in the matched normal (non-tumor) sample reads. |
| n_alt_count | The number of times the alternate allele (mutant base) was observed in the matched normal sample reads. |
| Mutation_Status | Indicates if the mutation is SOMATIC (acquired in the tumor, not inherited) or GERMLINE (inherited and present in all cells). Somatic is expected in cancer data. |
| Validation_Status | Indicates if the mutation was confirmed by a secondary method (like an independent test). Unknown means it was not specifically validated. |
| dbSNP_RS | The identifier (rs ID) from the dbSNP database if this change is a known polymorphism in the general population. If this is present, the mutation might not be cancer-specific. |
| Hotspot | A simple flag (0 or 1) indicating if this specific amino acid position is known to be frequently mutated across many cancer types. |
| ALLELE_NUM | The numerical index for the alternate allele (used internally in some formats). |
| IS_NEW | Indicates whether this specific mutation was previously seen in the MSK-IMPACT cohort. |
| Score | A general quality score or ranking of the mutation, which can vary depending on the analysis pipeline. |
| Tumor_Sample_Barcode | A unique, standardized identifier that links that specific mutation event back to the original tissue sample taken from a specific patient. |
It seems that ALLELE_NUM is an internal indicator, so we wouldn’t use it, therefor we’ll drop it.
data_mutation_without_nas <- data_mutation_without_nas |>
select(-c("ALLELE_NUM"))For IS_NEW, we’ll check if NA corresponds to a not-new allele, in this case we would just infere the NAs to be null or 0.
is_new_values <- data_mutation_without_nas |>
summarize(unique_is_new = unique(IS_NEW))Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
print("The values in the IS_NEW column are:")[1] "The values in the IS_NEW column are:"
is_new_values # A tibble: 2 × 1
unique_is_new
<chr>
1 <NA>
2 NEWRECORD
So indeed the NAs here means that it’s not new. So we will replace NAs by NOTNEW, and then input NOTNEW by 0 and NEWRECORD by 1 (hotspot encoding is easier to use in code).
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(IS_NEW = case_when(
is.na(IS_NEW) ~ 0,
IS_NEW == "NEWRECORD" ~ 1
))For the rest of the column with NAs, we will keep all their rows and just using na.rm = TRUE when using the columns (we can do inference bc they’re not statistical or calculated variable).
We will search all columns with constant value to also drop them as we already know their values.
is_single_value <- function(x) {
n_distinct(x) == 1
}
single_value_columns <- data_mutation_without_nas |>
summarise(across(everything(), is_single_value)) |>
pivot_longer(everything(), names_to = "column_name", values_to = "is_single_value") |>
filter(is_single_value == TRUE) |>
pull(column_name)
single_value_columns[1] "Center" "NCBI_Build" "Strand"
[4] "Validation_Status" "Score" "Hotspot"
So we can drop all these columns knowing their constant values.
data_mutation_without_nas <- data_mutation_without_nas |>
select(-c("Center","NCBI_Build","Strand","Validation_Status","Hotspot","Score"))is_factor <- function(x){
is_char <- is.character(x)
bool <- n_distinct(x)<=40
return(is_char & bool)
}
column_to_factor <- data_mutation_without_nas |>
select(where(is_factor)) |>
names()
print("The column to factor are:")[1] "The column to factor are:"
column_to_factor[1] "Chromosome" "Consequence" "Variant_Classification"
[4] "Variant_Type" "Mutation_Status"
Lot of NAs in ALLELE_NUM and IS_NEW, how to deal with?
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(across(all_of(column_to_factor), factor))We see that for some rows there are several consequences -> we split them into several columns. First lets check how many na is there in it.
consequence_na <- data_mutation_without_nas |>
summarise(consequence_na = sum(is.na(Consequence))) |>
pull(consequence_na)
sprintf("There is %i nas in the consequence column.", as.integer(consequence_na))[1] "There is 0 nas in the consequence column."
data_mutation_without_nas <- data_mutation_without_nas |>
mutate(n_consequences = str_count(Consequence,",")+1)max_consequences <- data_mutation_without_nas$n_consequences |>
max()
sprintf("The max number of consequences for one row is %.4f",max_consequences)[1] "The max number of consequences for one row is 4.0000"
data_mutation_without_nas |>
count(n_consequences) |>
ggplot(aes(x = n_consequences,
y = n)) +
geom_col(fill = "grey") +
theme_minimal() +
labs(x = "Number of consequences per observation")The vast majority observes only one consequence for their change in DNA.
data_mutation_without_nas_wide <- data_mutation_without_nas |>
separate(col = "Consequence",
into = str_c("consequence_", 1:max_consequences),
sep = ",",
remove = TRUE)Warning: Expected 4 pieces. Missing pieces filled with `NA` in 20560 rows [1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
data_mutation_without_nas_long <- data_mutation_without_nas_wide |>
pivot_longer(
cols = contains("consequence_"),
names_to = NULL,
values_to = "consequence"
) |>
drop_na("consequence") |>
mutate(
consequence = as.factor(consequence)
)
data_mutation_without_nas_long |>
sample_n(5)# A tibble: 5 × 26
Hugo_Symbol Entrez_Gene_Id Chromosome Start_Position End_Position
<chr> <dbl> <fct> <dbl> <dbl>
1 PREX2 80243 8 69031672 69031672
2 ATRX 546 X 76918967 76918967
3 TP53 7157 17 7578406 7578406
4 PBRM1 55193 3 52637689 52637689
5 FGFR4 2264 5 176520140 176520141
# ℹ 21 more variables: Variant_Classification <fct>, Variant_Type <fct>,
# Reference_Allele <chr>, Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>,
# Tumor_Sample_Barcode <chr>, Mutation_Status <fct>, t_ref_count <dbl>,
# t_alt_count <dbl>, n_ref_count <dbl>, n_alt_count <dbl>, HGVSc <chr>,
# HGVSp <chr>, HGVSp_Short <chr>, Transcript_ID <chr>, RefSeq <chr>,
# Protein_position <dbl>, Codons <chr>, IS_NEW <dbl>, n_consequences <dbl>,
# consequence <fct>
data_mutation_without_nas_long <- data_mutation_without_nas_long |>
mutate(patient_id = str_extract(Tumor_Sample_Barcode, "^P-\\d{7}")) |>
relocate(c(patient_id,Tumor_Sample_Barcode),
.before = Entrez_Gene_Id)data_mutation_without_nas_long |>
sample_n(5)# A tibble: 5 × 27
Hugo_Symbol patient_id Tumor_Sample_Barcode Entrez_Gene_Id Chromosome
<chr> <chr> <chr> <dbl> <fct>
1 KMT2D P-0014418 P-0014418-T01-IM6 8085 12
2 ERBB4 P-0005792 P-0005792-T01-IM5 2066 2
3 EPHA3 P-0021774 P-0021774-T01-IM6 2042 3
4 H3F3B P-0008328 P-0008328-T01-IM5 3021 17
5 CTNNB1 P-0004348 P-0004348-T01-IM5 1499 3
# ℹ 22 more variables: Start_Position <dbl>, End_Position <dbl>,
# Variant_Classification <fct>, Variant_Type <fct>, Reference_Allele <chr>,
# Tumor_Seq_Allele1 <chr>, Tumor_Seq_Allele2 <chr>, Mutation_Status <fct>,
# t_ref_count <dbl>, t_alt_count <dbl>, n_ref_count <dbl>, n_alt_count <dbl>,
# HGVSc <chr>, HGVSp <chr>, HGVSp_Short <chr>, Transcript_ID <chr>,
# RefSeq <chr>, Protein_position <dbl>, Codons <chr>, IS_NEW <dbl>,
# n_consequences <dbl>, consequence <fct>
data_mutation_without_nas_long |>
count(Variant_Classification) |>
ggplot(aes(x = Variant_Classification,
y = n)) +
geom_col(fill = "grey") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(
angle = 30,
hjust = 1
)) +
labs(x = "Count of the different variant classifications") data_mutation_without_nas_long |>
ggplot(aes( x = t_ref_count
)) +
geom_boxplot()data_mutation_without_nas_long |>
ggplot(aes( x = t_alt_count
)) +
geom_boxplot()Gene Panel Dataset
Load libraries:
library(tidyverse)
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)Data wrangling:
Load the Gene Panel Matrix dataset and select the SAMPLE_ID and IMPACT_version columns.
#Load the gene panel matrix data
gene_panel_matrix <- read_tsv(here("data",
"patient_sample_gene_panel.tsv"))Rows: 1661 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): PATIENT_ID, SEX, AGE_GROUP, DRUG_TYPE, SAMPLE_ID, CANCER_TYPE, SAM...
dbl (6): OS_MONTHS, OS_STATUS, SEX_NUM, SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Select the column of interest
gene_panel_matrix <- gene_panel_matrix |>
select(SAMPLE_ID,
IMPACT_version)
gene_panel_matrix# A tibble: 1,661 × 2
SAMPLE_ID IMPACT_version
<chr> <chr>
1 P-0000057-T01-IM3 IMPACT_v3
2 P-0000062-T01-IM3 IMPACT_v3
3 P-0000063-T01-IM3 IMPACT_v3
4 P-0000071-T01-IM3 IMPACT_v3
5 P-0000082-T01-IM3 IMPACT_v3
6 P-0000088-T01-IM3 IMPACT_v3
7 P-0000103-T01-IM3 IMPACT_v3
8 P-0000121-T01-IM3 IMPACT_v3
9 P-0000165-T01-IM3 IMPACT_v3
10 P-0000184-T01-IM3 IMPACT_v3
# ℹ 1,651 more rows
Load the Mutations dataset.
#Load the mutations data
data_mutation <- read_tsv(here("data",
"data_mutation_cleaned.tsv"),
show_col_types = FALSE)Some rows in the data_mutation object are duplicates that differ only in the “consequence” column. Given that the consequence is not going to be studied, these duplicate rows will be removed.
#Remove the duplicates
data_mutation <- data_mutation |>
distinct(across(-consequence),
.keep_all = TRUE)Rename some columns to keep the same format.
#Rename the ID column and the gene column
data_mutation <- data_mutation |>
rename(SAMPLE_ID = Tumor_Sample_Barcode,
GENE = Hugo_Symbol)Select the relevant columns from the data_mutation dataset and merge both datasets by SAMPLE_ID.
#Select the columns of interest
data_mutation <- data_mutation |>
select(SAMPLE_ID,
GENE,
Variant_Classification)
#Merge both objects in a new one
combined_data <- left_join(gene_panel_matrix,
data_mutation,
by = "SAMPLE_ID")Organize the data by grouping it by SAMPLE_ID, ensuring that each row corresponds to a single sample with a tibble listing the mutated genes. Then, calculate the number of mutations per patient.
#Group the data by the Sample ID and count the number of mutations per patient
combined_data <- combined_data |>
group_by(SAMPLE_ID) |>
summarise(
IMPACT_version = first(IMPACT_version),
n_mutation = n(),
mutations = list(select(cur_data(),
GENE,
Variant_Classification))
)Warning: There was 1 warning in `summarise()`.
ℹ In argument: `mutations = list(select(cur_data(), GENE,
Variant_Classification))`.
ℹ In group 1: `SAMPLE_ID = "P-0000057-T01-IM3"`.
Caused by warning:
! `cur_data()` was deprecated in dplyr 1.1.0.
ℹ Please use `pick()` instead.
combined_data# A tibble: 1,661 × 4
SAMPLE_ID IMPACT_version n_mutation mutations
<chr> <chr> <int> <list>
1 P-0000057-T01-IM3 IMPACT_v3 5 <tibble [5 × 2]>
2 P-0000062-T01-IM3 IMPACT_v3 6 <tibble [6 × 2]>
3 P-0000063-T01-IM3 IMPACT_v3 13 <tibble [13 × 2]>
4 P-0000071-T01-IM3 IMPACT_v3 10 <tibble [10 × 2]>
5 P-0000082-T01-IM3 IMPACT_v3 12 <tibble [12 × 2]>
6 P-0000088-T01-IM3 IMPACT_v3 12 <tibble [12 × 2]>
7 P-0000103-T01-IM3 IMPACT_v3 6 <tibble [6 × 2]>
8 P-0000121-T01-IM3 IMPACT_v3 3 <tibble [3 × 2]>
9 P-0000165-T01-IM3 IMPACT_v3 5 <tibble [5 × 2]>
10 P-0000184-T01-IM3 IMPACT_v3 8 <tibble [8 × 2]>
# ℹ 1,651 more rows
Calculate the percentage of each mutation for each IMPACT version. Group the less represented mutations into broader categories to improve the final plots clarity.
#Create a new object with the percentages of each type of mutation per
#IMPACT version
percentage_type <- combined_data |>
select(IMPACT_version,
mutations) |>
#It is necessary to temporary unnest to work with the nested tibble data
unnest(mutations)
#Group some mutation types to reduce the number of different categories
percentage_type <- percentage_type |>
#Combine Frame shift del and ins into "Frame_Shift_Mutation"
mutate(Variant_Classification = case_when(
Variant_Classification %in% c(
"Frame_Shift_Del",
"Frame_Shift_Ins") ~ "Frame_Shift_Mutation",
TRUE ~ Variant_Classification)) |>
#Group the less represented mutations into "Other"
mutate(Variant_Classification = ifelse(
Variant_Classification %in% c("Frame_Shift_Mutation",
"Missense_Mutation",
"Nonsense_Mutation",
"Splice_Site"),
Variant_Classification,
"Other"
)) |>
#Calculate the percentage
count(IMPACT_version,
Variant_Classification) |>
group_by(IMPACT_version) |>
mutate(percentage = n / sum(n) * 100)
#Order the categories for a better representation
percentage_type <- percentage_type |>
mutate(Variant_Classification = factor(Variant_Classification,
levels = c("Missense_Mutation",
"Nonsense_Mutation",
"Frame_Shift_Mutation",
"Splice_Site",
"Other")))Data representation
Number of mutations per IMPACT version
A representation of the number of mutations detected by the IMPACT version is done in order to assess whether the latest versions of IMPACT demonstrate improved capacity for identifying new mutations.
mutation_count_IMPACT <- ggplot(data = combined_data,
mapping = aes(x = IMPACT_version,
y = n_mutation,
fill = IMPACT_version)) +
geom_boxplot(alpha = 0.5) +
#It is necessary to do the log10 of the y values to
#prevent outliers from hiding the main distribution
scale_y_log10() +
theme_minimal() +
theme(legend.position = "none") +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
labs(title = "Number of mutations per IMPACT version (log scale)",
y = "Mutations (log10)",
x = "IMPACT version")
mutation_count_IMPACTAs shown in the plot, the overall number of mutations detected increases with higher IMPACT versions, reflecting that a greater number of studied genes corresponds to a higher number of detected mutations. The medians of detected mutations are the following:
results <- combined_data |>
group_by(IMPACT_version) |>
summarise(mut_median = median(n_mutation))
results# A tibble: 3 × 2
IMPACT_version mut_median
<chr> <int>
1 IMPACT_v3 5
2 IMPACT_v5 6
3 IMPACT_v6 7
Distribution of mutation type by IMPACT version
ggplot(data = percentage_type,
mapping = aes(x = IMPACT_version,
y = percentage,
fill = Variant_Classification)) +
geom_col(alpha = 0.8) +
theme_minimal() +
theme(panel.grid.major.x = element_blank()) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
labs(title = "Distribution of mutation type by IMPACT version",
y = "Percentage",
x = "IMPACT version",
fill = "Mutation type")While newer IMPACT versions detect more mutations overall, the distribution of mutation types remains largely unchanged. The most significant change is the proportion of Frameshift mutations, which increases with each IMPACT version.
This indicates that version upgrades increase sensitivity without substantially altering the mutation distribution.
Saving figures:
ggsave("../results/figures/mutation_count_IMPACT.png",
plot = mutation_count_IMPACT)Saving 7 x 5 in image
Structural Variance Dataset
Import library
library(tidyverse)
library(ggplot2)
library(here)
library(viridis)Import data
data_sv <- read_tsv(here("data", "sv_cleaned.tsv"))Rows: 346 Columns: 28
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (16): Sample_Id, Site1_Hugo_Symbol, Site1_Region, Site1_Chromosome, Site...
dbl (12): Site1_Region_Number, Site1_Position, Site2_Region_Number, Site2_Po...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_patient_sample <- read_tsv(here("data", "patient_sample_gene_panel.tsv"))Rows: 1661 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): PATIENT_ID, SEX, AGE_GROUP, DRUG_TYPE, SAMPLE_ID, CANCER_TYPE, SAM...
dbl (6): OS_MONTHS, OS_STATUS, SEX_NUM, SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_sv |> slice_sample(n = 5)# A tibble: 5 × 28
Sample_Id Site1_Hugo_Symbol Site1_Region Site1_Region_Number Site1_Chromosome
<chr> <chr> <chr> <dbl> <chr>
1 P-0001710… APC Exon 16 5
2 P-0020169… TEK Intron NA 9
3 P-0010238… APC Exon 16 5
4 P-0010004… EGFR Intron NA 7
5 P-0003132… KMT2D 3_Prime_UTR NA 12
# ℹ 23 more variables: Site1_Position <dbl>, Site1_Description <chr>,
# Site2_Hugo_Symbol <chr>, Site2_Region <chr>, Site2_Region_Number <dbl>,
# Site2_Chromosome <chr>, Site2_Position <dbl>, Site2_Description <chr>,
# Site2_Effect_On_Frame <chr>, Class <chr>, Breakpoint_Type <chr>,
# Connection_Type <chr>, Event_Info <chr>, Annotation <chr>, Comments <chr>,
# Normal_Read_Count <dbl>, Normal_Variant_Count <dbl>,
# Tumor_Paired_End_Read_Count <dbl>, Tumor_Split_Read_Count <dbl>, …
data_patient_sample |> slice_sample(n = 5)# A tibble: 5 × 25
PATIENT_ID SEX OS_MONTHS OS_STATUS AGE_GROUP DRUG_TYPE SEX_NUM SAMPLE_ID
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr>
1 P-0017602 Female 1 1 31-50 Combo 1 P-0017602-…
2 P-0001693 Female 2 1 50-60 PD-1/PDL-1 1 P-0001693-…
3 P-0005053 Male 36 0 50-60 Combo 0 P-0005053-…
4 P-0003628 Male 5 1 61-70 PD-1/PDL-1 0 P-0003628-…
5 P-0008340 Female 5 1 31-50 PD-1/PDL-1 1 P-0008340-…
# ℹ 17 more variables: CANCER_TYPE <chr>, SAMPLE_TYPE <chr>,
# SAMPLE_CLASS <chr>, METASTATIC_SITE <chr>, PRIMARY_SITE <chr>,
# CANCER_TYPE_DETAILED <chr>, GENE_PANEL <chr>, SAMPLE_COVERAGE <dbl>,
# TUMOR_PURITY <chr>, ONCOTREE_CODE <chr>, INSTITUTE <chr>,
# SOMATIC_STATUS <chr>, AGE_AT_SEQ_REPORT <dbl>, TMB_NONSYNONYMOUS <dbl>,
# mutations <chr>, structural_variants <chr>, IMPACT_version <chr>
Data augmentation
The clinical data contains 1.6k rows, where each entry represents a unique patient with a unique patient ID and a unique sample ID. In contrast, the Structural Variants (SV) data only has 346 rows, and the sample ID column is not a primary key: a single patient may have multiple entries, each representing a different SV.
Therefore, we can augment our clinical data in two distinct ways: either by looking at the SVs separately (resulting in the sv_clinic_augm dataset), or by aggregating and adding the number of each SV class per patient (resulting in the clinical_with_sv dataset).
sv_clinic_augm <- data_sv |> left_join(data_patient_sample, by = c("Sample_Id" = "SAMPLE_ID"))
sv_per_patient <- data_sv |>
group_by(Sample_Id) |>
summarize(Total_SV = n(),
N_Deletions = sum(Class == "DELETION"),
N_Duplications = sum(Class == "DUPLICATION"),
N_Inversions = sum(Class == "INVERSION"),
N_Translocations = sum(Class == "TRANSLOCATION")
)
clinical_with_sv <- data_patient_sample |>
left_join(sv_per_patient, by = c("SAMPLE_ID" = "Sample_Id")) |>
mutate(Total_SV = replace_na(Total_SV, 0))Exploratory analysis
Distribution of structural variant class
sv_clinic_augm |> ggplot(aes(x = Class, fill = Class))+
geom_bar(color = "black", alpha = 0.9) +
theme_minimal()+
scale_fill_viridis(option = "viridis",
discrete = TRUE)+
theme(panel.grid.major.x = element_blank()) +
labs(x = "Structural variant class", y =" counts")Distribution of SVs length:
We omitted the translocation class here, as it lacks a determined size and was assigned a length of 0 in the dataset.
SVs_length_distrib <- sv_clinic_augm |> filter(Class != "TRANSLOCATION") |>
ggplot(aes(x=SV_Length, fill = Class)) +
geom_histogram(position = "identity", alpha = 0.8) +
scale_x_log10() +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() +
labs(
x = 'SV Length (Log10 scale)',
y = 'Count',
fill = 'SV Class',
title = 'Distribution of Structural Variant lengths by Class',
subtitle = 'Translocations excluded'
)
SVs_length_distrib`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
Even if deletion are over-represented, the few inversion are extremely long. We can also see the distribution of lengths within each class by using a density plot
sv_clinic_augm |> filter(Class != "TRANSLOCATION") |>
ggplot(aes(x = SV_Length, fill = Class, color = Class)) +
geom_density(alpha = 0.5) +
scale_x_log10() +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
scale_color_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() +
labs(
x = 'SV Length (Log10 scale)',
y = 'density',
fill = 'SV Class',
title = 'Density distribution of Structural Variant lengths by Class',
subtitle = 'Translocations excluded'
) Warning: Removed 1 row containing non-finite outside the scale range
(`stat_density()`).
This clearly highlights the disparities in lenghts per class.
Class and number of SVs per cancer type
SVs_per_cancer <- sv_clinic_augm |>
group_by(CANCER_TYPE) |>
ggplot(mapping = aes(x = CANCER_TYPE, fill = Class)) +
geom_bar(color = "black", linewidth = 0.2, alpha = 0.9) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
size = 7)) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme(panel.grid.major.x = element_blank()) +
labs(title = "Distribution of SV Class by Cancer Type",
x = "Cancer Type",
y = "SV Count")
SVs_per_cancerExploration of link between SVs, TMB and all cancer types
While TMB captures nucleotide-level errors, SVs represent structural genomic instability. Here, we explore the absence or presence of a link between those 2 genomic modifications, considering each patient separately (clinical_with_sv).
# clinical_with_sv |>
# mutate(Has_SV = ifelse(Total_SV > 0, "With SV", "No SV")) |>
# ggplot(aes(x = SAMPLE_TYPE, fill = Has_SV)) +
# geom_bar(position = "fill") +
# theme_minimal() +
# labs(title = "Are Structural Variants enriched in Metastases?",
# y = "Proportion of Patients",
# x = NULL,
# fill = "SV Status")
clinical_with_sv |>
mutate(Has_SV = ifelse(Total_SV > 0, "With SV", "No SV")) |>
ggplot(aes(x = SAMPLE_TYPE, fill = Has_SV)) +
geom_bar(position = "fill") +
scale_y_continuous() +
coord_flip() +
facet_wrap(~CANCER_TYPE, ncol = 2) +
theme_minimal() +
labs(title = "Are Structural Variants enriched in Metastases?",
subtitle = "Comparison by Cancer Type",
y = "Proportion of Patients",
x = NULL,
fill = "SV Status")clinical_with_sv |> mutate(CANCER_TYPE = fct_reorder(CANCER_TYPE, TMB_NONSYNONYMOUS, .fun = median)) |>
ggplot(aes(x = CANCER_TYPE, y = TMB_NONSYNONYMOUS)) +
geom_boxplot(fill = "orange", alpha = 0.6) +
scale_y_log10() +
coord_flip() +
theme_minimal() +
labs(title = "Tumor Mutational Burden (TMB) across Cancer Types",
x = NULL,
y = "TMB") +
theme_minimal()Warning in scale_y_log10(): log-10 transformation introduced infinite values.
Warning: Removed 55 rows containing non-finite outside the scale range
(`stat_boxplot()`).
There are no specific cancer types with an unusually high TMB. Maybe primary sites and metastases share different TMB?
clinical_with_sv |>
ggplot(aes(x = SAMPLE_TYPE, y = TMB_NONSYNONYMOUS, fill = SAMPLE_TYPE)) +
geom_boxplot(alpha = 0.8) +
scale_y_log10() +
facet_wrap(~CANCER_TYPE, scales = "free_y") +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Comparison of TMB in Primary vs. Metastasis samples per cancer type",
subtitle = "",
x = NULL,
y = "TMB (log scale)")Warning in scale_y_log10(): log-10 transformation introduced infinite values.
Warning: Removed 55 rows containing non-finite outside the scale range
(`stat_boxplot()`).
clinical_with_sv |> mutate(CANCER_TYPE = fct_reorder(CANCER_TYPE, Total_SV, .fun = mean)) |>
ggplot(aes(x = CANCER_TYPE, y = Total_SV)) +
geom_jitter(alpha = 0.3, color = "darkblue") +
coord_flip() +
labs(title = "Structural Variants per Cancer Type", x = NULL, y = "SVs Counts") +
theme_minimal()The recorded SVs are too sparse to represent a substantial part of any cancer type.
Maybe we can try to see if some cancers are SV-focused, or TMB focused?
clinical_with_sv |>
ggplot(aes(x = TMB_NONSYNONYMOUS, y = Total_SV)) +
#geom_point(aes(color = CANCER_TYPE), alpha = 0.6) +
geom_jitter(aes(color = CANCER_TYPE), alpha = 0.6) +
geom_smooth(method = "lm", color = "black", se = FALSE) +
scale_x_log10() +
theme_minimal() +
labs(title = "Correlation: TMB vs. SV",
x = "TMB",
y = "Number of SVs")Warning in scale_x_log10(): log-10 transformation introduced infinite values.
log-10 transformation introduced infinite values.
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 55 rows containing non-finite outside the scale range
(`stat_smooth()`).
The lack of sequenced data for the SVs presents a severe limitation for this analysis.
Exporting plots
if(!dir.exists(here("results"))) dir.create(here("results"))
if(!dir.exists(here("results",'figures'))) dir.create(here("results", 'figures'))
ggsave(here('results','figures', 'SV_count_per_cancer.png'), SVs_per_cancer)Saving 7 x 5 in image
ggsave(here('results','figures', 'SV_lengths_distribution.png'), SVs_length_distrib)Saving 7 x 5 in image
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
Downstream Analysis
These scripts perform more particular Analysis to follow findings from the exploratory analysis.
quarto::quarto_render(here("R", "05_Survival_rate_parameter_analysis.qmd"))
quarto::quarto_render(here("R", "06_mutation_types.qmd"))
quarto::quarto_render(here("R", "07_EGFR_SV_analysis.qmd"))
quarto::quarto_render(here("R", "08_Cox_Regression.qmd"))
quarto::quarto_render(here("R", "09_gene_enrichment_analysis.qmd"))Survival Rate Analysis
Loading Libraries
library('tidyverse')
library('survival')
library('here')
library("viridis")Loading Data
patients_sample_gene_panel <- read_tsv(here("data", "patient_sample_gene_panel.tsv"))Rows: 1661 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): PATIENT_ID, SEX, AGE_GROUP, DRUG_TYPE, SAMPLE_ID, CANCER_TYPE, SAM...
dbl (6): OS_MONTHS, OS_STATUS, SEX_NUM, SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data Wrangling
Selecting our columns of interest (originally form clinical patient and clinical sample files).
patients_sample <- patients_sample_gene_panel |>
select(c(SEX, OS_MONTHS, OS_STATUS, AGE_GROUP, DRUG_TYPE, CANCER_TYPE, TMB_NONSYNONYMOUS, AGE_AT_SEQ_REPORT))Analysis
Looking at both patients and sample, it can be interesting to look at various variables.
Starting with the Tumor Mutational Burden (TMB), is there a correlation between the drug given and the TMB?
#Comparing the number of mutation, and the drug given, in the case of Glioma cancer
patients_sample <- patients_sample |>
mutate(TMB_group = case_when(TMB_NONSYNONYMOUS<5 ~'Low TMB',
5<=TMB_NONSYNONYMOUS & TMB_NONSYNONYMOUS<20 ~'Intermediate TMB',
TMB_NONSYNONYMOUS>=20 ~'High TMB'
) |>
factor(levels = c('Low TMB', 'Intermediate TMB', 'High TMB'),
ordered = TRUE
),
AGE_GROUP = factor(AGE_GROUP,
levels = c("<30", "31-50", "50-60", "61-70", ">71"),
ordered = TRUE
)
)
patients_sample |>
ggplot(mapping = aes(x = TMB_group,
fill = DRUG_TYPE)
)+
geom_bar()+
labs(title = 'Bar plot of Drug given againts different severity of TMB', x = 'TMB Severity', fill = 'Drug Type')+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7) Analysis by Cancer Type
Survival rate analysis
Does the cancer type affects the survival of patients?
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~CANCER_TYPE, data = patients_sample)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata), KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "CANCER_TYPE="))
survival_cancer_type <- ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis(option = "viridis",
discrete = TRUE) +
coord_cartesian(ylim = c(0,NA))+
labs(title = 'Survival Curve of the patients depending on their Cancer type',
x = 'Time (in months)',
y = 'Overall Survival Probability',
color = 'Cancer Type')+
theme_minimal()
survival_cancer_typeThe cancer definitly impacts the survival probability, it can be interesting to look more into the different cancer types.
Further analysis of patients based on their cancer type
Looking at the ages when different cancers were diagnosed.
ggplot(patients_sample,
mapping = aes(x = AGE_AT_SEQ_REPORT,
fill = CANCER_TYPE))+
geom_bar()+
scale_fill_viridis(option = "viridis",
discrete = TRUE)+
labs(title = 'Repartition of cancers at the different diagnosis ages',
x ='Age at Diagnosis',
fill = 'Cancer Type')+
theme_minimal()Warning: Removed 1 row containing non-finite outside the scale range
(`stat_count()`).
Are there cancers that lead to higher TMB?
ggplot(patients_sample,
mapping = aes(x = factor(TMB_group, order = TRUE),
fill = CANCER_TYPE))+
geom_bar()+
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
labs(title = 'Proportion of cancer types depending on their TMB severity',
x = 'TMB severity',
fill = 'Cancer Type')+
theme_minimal()Repartition of cancers by gender:
ggplot(patients_sample,
mapping = aes(x = CANCER_TYPE,
fill = SEX))+
geom_bar()+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha =0.7) +
labs(title = 'Proportion of males and females for each cancer type',
x ='Cancer Type',
fill = ' Gender')+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))Are there drugs more given to a certain type of cancer?
ggplot(patients_sample,
mapping = aes(x = CANCER_TYPE,
fill = DRUG_TYPE))+
geom_bar()+
labs(title = 'Proportion of the type of drug given for each Cancer Type',
x = 'cancer Type',
fill = 'Drug Type')+
scale_fill_viridis(option = "viridis",
discrete = TRUE,
alpha = 0.7) +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))Survival Analysis
the impact of TMB on all cancer survival rate:
Does the tumor mutational burden (liked to the number of mutation) affect the survival and thus is an indicator of the severity of the cancer?
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~TMB_group, data = patients_sample)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "TMB_group="))
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis(option = "viridis",
discrete = TRUE)+
coord_cartesian(ylim = c(0,NA))+
labs(title = 'Survival Curve of the patients depending on their TMB',
x = 'Time (in months)',
y = 'Overall Survival Probability',
color = 'TMB severity', )+
theme_minimal()Higher TMB leads to higher survival.
We chose to focus our analysis on Glioma cancer:
Diving into Glioma
patients_sample_glioma <- patients_sample |>
filter(CANCER_TYPE == "Glioma")Survival Analysis stratified by Tumor Mutational Burden
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~TMB_group, data = patients_sample_glioma)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "TMB_group="))
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(option = "viridis") +
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their TMB',
x = 'Time (in months)',
y = 'Overall Survival Probability',
color = 'TMB severity')+
theme_minimal()There is maybe not enough High TMB to make a concrete conclusion here, the survival curve of high TMB could be biased.
Lets look more into it, what is the distribution of TMB in Glioma?
Further analysis of Glioma cancer
#looking at the repartition of TMBs in the different sections
ggplot(patients_sample_glioma,
mapping = aes(x = TMB_group))+
geom_bar()+
labs(title = 'Repartition of TMB severity among patients with Glioma',
x ='TMB severity')+
theme_minimal()Does the TMB in glioma impact the choice f medication?
ggplot(patients_sample_glioma,
mapping = aes(x = (TMB_group),
fill = DRUG_TYPE))+
geom_bar()+
scale_fill_viridis_d(option = "viridis",
alpha = 0.7) +
labs(title = 'Drug given to Glioma patients depending on their TMB severity',
x = 'TMB severity',
fill = 'Drug Type')+
theme_minimal()Survival Analysis
For Glioma, some results showed that the gender had a greater impact on the survival than the age:
KM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~SEX, data = patients_sample_glioma)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "SEX="))
survival_glioma_gender <- ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(option = "viridis") +
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their TMB',
x = 'Time (in months)',
y = 'Overall Survival Probability',
color = 'Gender')+
theme_minimal()
survival_glioma_genderKM_f <- survfit(Surv(OS_MONTHS, OS_STATUS) ~AGE_GROUP, data = patients_sample_glioma)
#convert to a tibble
tibble_KM_f <- tibble(
time = KM_f$time,
survival = KM_f$surv,
strata = rep(names(KM_f$strata),KM_f$strata))
tibble_KM_f <- tibble_KM_f %>%
mutate(strata = str_remove(strata, "AGE_GROUP="),
strata = factor(strata,
levels = c("<30", "31-50", "50-60", "61-70", ">71"),
ordered = TRUE
)
)
ggplot(tibble_KM_f,
mapping = aes(x = time,
y = survival,
color = strata))+
geom_step()+
scale_color_viridis_d(option = "viridis") +
coord_cartesian(ylim = c(0, NA))+
labs(title = 'Survival Curve of the patients depending on their TMB',
x = 'Time (in months)',
y = 'Overall Survival Probability',
color = 'Age group at diagnosis')+
theme_minimal()Saving Figures:
ggsave("../results/figures/survival_cancer_type.png", plot = survival_cancer_type)Saving 7 x 5 in image
ggsave("../results/figures/survival_glioma_gender.png", plot = survival_glioma_gender)Saving 7 x 5 in image
Mutation Types
Load libraries
library(tidyverse)
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)Data wrangling
Load the mutations dataset and remove duplicate entries that only differ in the consequence column (they are not going to be studied). Select the columns of interest and rename them to SAMPLE_ID, GENE and Variant_Classification. Finally, group the less represented mutation types into broader categories to improve the final plots clarity.
#Load the mutations data
mutations <- read_tsv(here("data",
"data_mutation_cleaned.tsv"))Rows: 21383 Columns: 27
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (17): Hugo_Symbol, patient_id, Tumor_Sample_Barcode, Chromosome, Variant...
dbl (10): Entrez_Gene_Id, Start_Position, End_Position, t_ref_count, t_alt_c...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Remove the duplicates
mutations <- mutations |>
distinct(across(-consequence),
.keep_all = TRUE) |>
#Select the columns of interest
select(Hugo_Symbol,
Tumor_Sample_Barcode,
Variant_Classification) |>
#Modify the columns name
rename(SAMPLE_ID = Tumor_Sample_Barcode,
GENE = Hugo_Symbol) |>
#Group some mutation types to reduce the number of different categories
#Combine Frame shift del and ins into "Frame_Shift_Mutation"
mutate(Variant_Classification = case_when(
Variant_Classification %in% c(
"Frame_Shift_Del",
"Frame_Shift_Ins") ~ "Frame_Shift_Mutation",
TRUE ~ Variant_Classification)) |>
#Group the less represented mutations into "Other"
mutate(Variant_Classification = ifelse(
Variant_Classification %in% c("Frame_Shift_Mutation",
"Missense_Mutation",
"Nonsense_Mutation",
"Splice_Site"),
Variant_Classification,
"Other"
))
mutations# A tibble: 20,347 × 3
GENE SAMPLE_ID Variant_Classification
<chr> <chr> <chr>
1 PIK3CB P-0024731-T01-IM6 Missense_Mutation
2 TP53 P-0024731-T01-IM6 Missense_Mutation
3 TP53 P-0024731-T01-IM6 Missense_Mutation
4 PIK3C2G P-0024731-T01-IM6 Missense_Mutation
5 AXIN1 P-0024731-T01-IM6 Missense_Mutation
6 RECQL4 P-0024731-T01-IM6 Other
7 GLI1 P-0024731-T01-IM6 Frame_Shift_Mutation
8 BCL6 P-0024731-T01-IM6 Missense_Mutation
9 CASP8 P-0024731-T01-IM6 Missense_Mutation
10 CDKN2A P-0024731-T01-IM6 Missense_Mutation
# ℹ 20,337 more rows
Load the Clinical Sample data.
#Load the clinical sample data
clinical_sample <- read_tsv(here("data",
"patient_sample_gene_panel.tsv"))Rows: 1661 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): PATIENT_ID, SEX, AGE_GROUP, DRUG_TYPE, SAMPLE_ID, CANCER_TYPE, SAM...
dbl (6): OS_MONTHS, OS_STATUS, SEX_NUM, SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_sample# A tibble: 1,661 × 25
PATIENT_ID SEX OS_MONTHS OS_STATUS AGE_GROUP DRUG_TYPE SEX_NUM SAMPLE_ID
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr>
1 P-0000057 Female 0 1 31-50 PD-1/PDL-1 1 P-0000057…
2 P-0000062 Male 1 1 >71 PD-1/PDL-1 0 P-0000062…
3 P-0000063 Male 42 0 61-70 PD-1/PDL-1 0 P-0000063…
4 P-0000071 Male 43 0 61-70 PD-1/PDL-1 0 P-0000071…
5 P-0000082 Male 57 0 50-60 PD-1/PDL-1 0 P-0000082…
6 P-0000088 Male 12 1 61-70 PD-1/PDL-1 0 P-0000088…
7 P-0000103 Male 18 1 31-50 Combo 0 P-0000103…
8 P-0000121 Male 4 0 31-50 PD-1/PDL-1 0 P-0000121…
9 P-0000165 Female 1 1 61-70 PD-1/PDL-1 1 P-0000165…
10 P-0000184 Male 8 0 61-70 PD-1/PDL-1 0 P-0000184…
# ℹ 1,651 more rows
# ℹ 17 more variables: CANCER_TYPE <chr>, SAMPLE_TYPE <chr>,
# SAMPLE_CLASS <chr>, METASTATIC_SITE <chr>, PRIMARY_SITE <chr>,
# CANCER_TYPE_DETAILED <chr>, GENE_PANEL <chr>, SAMPLE_COVERAGE <dbl>,
# TUMOR_PURITY <chr>, ONCOTREE_CODE <chr>, INSTITUTE <chr>,
# SOMATIC_STATUS <chr>, AGE_AT_SEQ_REPORT <dbl>, TMB_NONSYNONYMOUS <dbl>,
# mutations <chr>, structural_variants <chr>, IMPACT_version <chr>
Create a subset to study the differences for each Glioma detailed cancer type.
#Create a subset for Glioma detailed cancer types
sample_glioma <- clinical_sample |>
filter(CANCER_TYPE=='Glioma') |>
select(SAMPLE_ID,
CANCER_TYPE,
CANCER_TYPE_DETAILED)
sample_glioma# A tibble: 117 × 3
SAMPLE_ID CANCER_TYPE CANCER_TYPE_DETAILED
<chr> <chr> <chr>
1 P-0000223-T01-IM3 Glioma Anaplastic Astrocytoma
2 P-0000749-T01-IM3 Glioma Anaplastic Astrocytoma
3 P-0000758-T01-IM3 Glioma Glioblastoma Multiforme
4 P-0000818-T01-IM3 Glioma Glioblastoma Multiforme
5 P-0001052-T01-IM3 Glioma Glioblastoma Multiforme
6 P-0001388-T01-IM3 Glioma Glioblastoma Multiforme
7 P-0001400-T01-IM3 Glioma Glioblastoma Multiforme
8 P-0001406-T01-IM3 Glioma Glioblastoma Multiforme
9 P-0001408-T01-IM3 Glioma Anaplastic Astrocytoma
10 P-0001420-T01-IM3 Glioma Anaplastic Oligodendroglioma
# ℹ 107 more rows
Select the columns of interest in the Clinical Sample dataset.
clinical_sample <- clinical_sample |>
#Select the columns of interest
select(SAMPLE_ID,
CANCER_TYPE)Merge the Clinical Sample dataset with the Mutations dataset. From the resulting object, create a subset for the Glioma subcategories.
#Merge the objects
clinical_sample_mutation <- left_join(mutations,
clinical_sample,
by = "SAMPLE_ID")
#Create a subset for glioma mutations
glioma_mutations <- clinical_sample_mutation |>
filter(CANCER_TYPE == "Glioma")
glioma_sample_mutation <- left_join(glioma_mutations,
sample_glioma,
by = "SAMPLE_ID")Count the frequency of each mutation across cancer types and within each glioma subcategory in the subset.
#Count the frequence of each mutation for each cancer type
mutations_count <- clinical_sample_mutation |>
group_by(CANCER_TYPE,
Variant_Classification) |>
summarise(count = n(),
.groups = "drop")#Count the frequence of each mutation for each detailed glioma type
glioma_mutations_count <- glioma_sample_mutation |>
group_by(CANCER_TYPE_DETAILED,
Variant_Classification) |>
summarise(count = n(),
.groups = "drop") |>
filter(!is.na(CANCER_TYPE_DETAILED))Data representation
mutation_by_cancer <- ggplot(data = mutations_count,
mapping = aes(x = CANCER_TYPE,
y = count,
fill = Variant_Classification)) +
geom_col(alpha = 0.8) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
size = 7),
panel.grid.major.x = element_blank()) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
labs(title = "Distribution of Mutation Type by Cancer Type",
x = "Cancer Type",
y = "Mutation Count",
fill = "Mutation Type")ggplot(data = glioma_mutations_count,
mapping = aes(x = CANCER_TYPE_DETAILED,
y = count,
fill = Variant_Classification)) +
geom_col(alpha = 0.8) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
size = 7),
panel.grid.major.x = element_blank()) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
labs(title = "Distribution of Mutation Type by Glioma Subcategory",
x = "Detailed Glioma Type",
y = "Mutation Count",
fill = "Mutation Type")Saving figures:
ggsave("../results/figures/mutation_by_cancer.png",
plot = mutation_by_cancer)Saving 7 x 5 in image
Subsetted Structural Variance Analysis
Import library
library(tidyverse)
library(ggplot2)
library(here)
library(viridis)Import data
As this study will later focus on a single gene, we also want to compare with the data from the mutations file
data_sv <- read_tsv(here("data", "sv_cleaned.tsv"))Rows: 346 Columns: 28
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (16): Sample_Id, Site1_Hugo_Symbol, Site1_Region, Site1_Chromosome, Site...
dbl (12): Site1_Region_Number, Site1_Position, Site2_Region_Number, Site2_Po...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_patient_sample <- read_tsv(here("data", "patient_sample_gene_panel.tsv"))Rows: 1661 Columns: 25
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (19): PATIENT_ID, SEX, AGE_GROUP, DRUG_TYPE, SAMPLE_ID, CANCER_TYPE, SAM...
dbl (6): OS_MONTHS, OS_STATUS, SEX_NUM, SAMPLE_COVERAGE, AGE_AT_SEQ_REPORT,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_mutations <- read_tsv(file = here('data','data_mutation_cleaned.tsv'))Rows: 21383 Columns: 27
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (17): Hugo_Symbol, patient_id, Tumor_Sample_Barcode, Chromosome, Variant...
dbl (10): Entrez_Gene_Id, Start_Position, End_Position, t_ref_count, t_alt_c...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Data augmentation
As already stated in the SV class analysis, clinical and SV data have different dimensions. The same reasoning applies with the mutations file, where several mutations have been sequenced for each patient.
sv_clinic_augm <- data_sv |> left_join(data_patient_sample, by = c("Sample_Id" = "SAMPLE_ID"))
mutations_augm_pat <- data_mutations |>
left_join(data_patient_sample, by = c("Tumor_Sample_Barcode" = "SAMPLE_ID"))Data analysis
Is a gene recurrent among SVs?
To assess whether a gene is overrepresented among SVs, we analyze two distinct locations: the beginning site and the ending site of the SV. The genes associated with these sites are provided by the Site1_Hugo_Symbol and Site2_Hugo_Symbol attributes, respectively. While we can examine these sites separately, it is more robust to either sum or take the union of the two associated gene sets. By summing the events, we consider each impact on a gene and quantify the frequency of these impacts. Alternatively, taking the union allows us to count only the unique SVs in which each specific gene is involved at least once.
First, summing all counts:
genes_longer <- sv_clinic_augm |>
pivot_longer(cols = c("Site1_Hugo_Symbol", "Site2_Hugo_Symbol"), names_to = "Position", values_to = "Gene_Name")|>
filter(!is.na(Gene_Name))
genes_longer |>
count(Gene_Name, sort = TRUE) |>
head(10)# A tibble: 10 × 2
Gene_Name n
<chr> <int>
1 EGFR 59
2 BRAF 13
3 TP53 13
4 APC 11
5 RB1 10
6 ROS1 10
7 CDKN2A 8
8 SMARCA4 8
9 ALK 7
10 FGFR3 7
Then only taking the union:
genes_longer_distinct <- sv_clinic_augm |>
pivot_longer(cols = c("Site1_Hugo_Symbol", "Site2_Hugo_Symbol"), names_to = "Position", values_to = "Gene_Name")|>
filter(!is.na(Gene_Name)) |>
distinct(Sample_Id, Gene_Name)
genes_longer_distinct |>
count(Gene_Name, sort = TRUE) |>
head(10)# A tibble: 10 × 2
Gene_Name n
<chr> <int>
1 EGFR 29
2 BRAF 10
3 ROS1 9
4 APC 8
5 FGFR3 7
6 TP53 7
7 ALK 6
8 RB1 6
9 TACC3 6
10 CDKN2A 5
Also, we can look at each site separately:
sv_clinic_augm |>
count(Site1_Hugo_Symbol, sort = TRUE) |>
head(10)# A tibble: 10 × 2
Site1_Hugo_Symbol n
<chr> <int>
1 EGFR 30
2 ROS1 7
3 BRAF 6
4 EWSR1 6
5 TP53 6
6 ALK 5
7 APC 5
8 CDKN2A 5
9 ETV6 5
10 SMARCA4 5
sv_clinic_augm |>
count(Site2_Hugo_Symbol, sort = TRUE) |>
head(10)# A tibble: 10 × 2
Site2_Hugo_Symbol n
<chr> <int>
1 EGFR 29
2 BRAF 7
3 TP53 7
4 APC 6
5 RB1 6
6 FGFR3 5
7 CD74 4
8 NOTCH1 4
9 CDKN2A 3
10 CDKN2BAS 3
EGFR is over represented, whatever the counting method used.
top_genes <- genes_longer |>
count(Gene_Name, sort = TRUE) |>
head(10)
SVs_per_gene <- top_genes |>
mutate(Gene_Name = factor(Gene_Name, levels = rev(Gene_Name))) |>
ggplot(aes(x = Gene_Name, y = n, fill=Gene_Name)) +
geom_text(aes(label = n), hjust = -0.5, size = 4) +
geom_col(color = "black", alpha = 0.9, linewidth = 0.3) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() +
coord_flip() +
theme(panel.grid.major.y = element_blank(), legend.position = "none")+
labs(
title = "Top 10 Genes Impacted by Structural Variant Events",
subtitle = "EGFR is the most frequent hit",
x = NULL,
y = "Total SV Events (Site 1 + Site 2)"
)
SVs_per_geneThe gene EGFR is indeed involved in 59 sites over 692 sites for the 346 SVs. Let’s have a closer look at this. Here, we are reasoning per SV, and only consider the SVs where EGFR is involved either on at least one of the 2 sites.
EGFR_SV <- sv_clinic_augm |>
filter(Site1_Hugo_Symbol == "EGFR" | Site2_Hugo_Symbol == "EGFR")
EGFR_SV |>
ggplot(aes(x = Class, fill = Class)) +
geom_bar(color = "black", alpha = 0.9, linewidth = 0.3) +
theme_minimal() +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme(legend.position = "none", panel.grid.major.x = element_blank())EGFR is involved in 30 SVs, and among these 25 are deletion! Let’s see if a type of cancer is predominant within the EGFR-involved SVs.
EGFR_per_cancer <- EGFR_SV |>
ggplot(aes(x = CANCER_TYPE, fill = Class)) +
geom_bar(color = "black", alpha = 0.8, linewidth = 0.3) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() +
theme(panel.grid.major.x = element_blank()) +
labs(
title = "Frequency of EGFR SV Events by Cancer Type",
subtitle = "Decomposition of unique EGFR SVs by variant class and cancer type.",
x = NULL,
y = "Total SVs Implicating EGFR",
fill = "SV Class"
)
EGFR_per_cancerAlmost all EGFR SVs are from glioma samples. But are there SVs impacting other gene in glioma ?
glioma_SV <- sv_clinic_augm |>
filter(CANCER_TYPE == "Glioma")
glioma_SV_longer <- sv_clinic_augm |>
filter(CANCER_TYPE == "Glioma") |>
pivot_longer(cols = c("Site1_Hugo_Symbol", "Site2_Hugo_Symbol"), names_to = "Position", values_to = "Gene_Name")|>
filter(!is.na(Gene_Name))
glioma_SV |>
ggplot(aes(x = Class, fill = Class)) +
geom_bar(color = "black", alpha = 0.8) +
theme_minimal() +
theme(legend.position = "none")We will look at the genes from Site 1 and Site 2 separately
glioma_SV |>
ggplot(aes(x = Site1_Hugo_Symbol, fill = Site1_Hugo_Symbol)) +
geom_bar(color = "black", alpha = 0.8) +
coord_flip() +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() glioma_SV |>
ggplot(aes(x = Site2_Hugo_Symbol, fill = Site2_Hugo_Symbol)) +
geom_bar(color = "black", alpha = 0.8) +
coord_flip() +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() EGFR is overrepresented in both cancer types, confirming it as a major SV driver in glioma. We will therefore continue to aggregate both impact sites (Site 1 and Site 2) to account for the total event frequency.
other_genes_levels <- glioma_SV_longer |>
filter(Gene_Name != "EGFR") |>
pull(Gene_Name) |>
unique() |>
sort()
new_levels <- c("EGFR", other_genes_levels) # for EGFR to have the darkest color of the viridis palette
SV_genes_in_glioma <- glioma_SV_longer |>
mutate(Group = ifelse(Gene_Name == "EGFR", "EGFR", "Other_genes"), Gene_Name = factor(Gene_Name, levels = new_levels)) |>
ggplot(aes(x = Group, fill = Gene_Name)) +
geom_bar(color = "black", width = 0.6, linewidth = 0.25, alpha = 0.9) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
theme_minimal() +
labs(
title = "SV Event Frequency: EGFR vs. Other Genes in Glioma",
subtitle = "The 'Other_genes' bar is decomposed by individual gene.",
x = NULL,
y = "Total SV Impact Events"
) +
theme(legend.position = "none", panel.grid.major.x = element_blank())
SV_genes_in_gliomaStudy of mutations for glioma samples
Based on the previous findings, it would be interesting to look at the genes most impacted by pinpoint mutations in glioma. Does EGFR represent a large share of these genes?
data_mutations <- read_tsv(file = here('data','data_mutation_cleaned.tsv'))Rows: 21383 Columns: 27
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (17): Hugo_Symbol, patient_id, Tumor_Sample_Barcode, Chromosome, Variant...
dbl (10): Entrez_Gene_Id, Start_Position, End_Position, t_ref_count, t_alt_c...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mutations_augm_pat <- data_mutations |>
left_join(data_patient_sample, by = c("Tumor_Sample_Barcode" = "SAMPLE_ID"))
glioma_mut <- mutations_augm_pat |>
filter(CANCER_TYPE == "Glioma")top_genes_mut <- glioma_mut |>
count(Hugo_Symbol, sort = TRUE) |>
slice_head(n = 10)
top_genes_mut# A tibble: 10 × 2
Hugo_Symbol n
<chr> <int>
1 TERT 87
2 TP53 55
3 PTEN 38
4 NF1 36
5 ATRX 28
6 EGFR 26
7 IDH1 20
8 KMT2D 15
9 PDGFRA 13
10 PIK3R1 13
EGFR is not the first gene impacted by mutations, but it still is in 6th position.
# Highlighting the EGFR column in red
genes_list <- unique(top_genes_mut$Hugo_Symbol)
custom_colors <- rep("black", length(genes_list))
names(custom_colors) <- genes_list
custom_colors["EGFR"] <- "red"
Mutation_count_in_glioma <- top_genes_mut |>
mutate(Hugo_Symbol = fct_reorder(Hugo_Symbol, n)) |>
ggplot(aes(x = Hugo_Symbol, y = n, fill=Hugo_Symbol, color = Hugo_Symbol)) +
geom_text(aes(label = n), hjust = -0.5, size = 4) +
geom_col(color = custom_colors, alpha = 0.9, linewidth = 0.3) +
scale_fill_viridis(option = "viridis",
discrete = TRUE) +
scale_color_manual(values = custom_colors) +
theme_minimal() +
coord_flip() +
theme(panel.grid.major.y = element_blank(), legend.position = "none")+
labs(
title = "Top 10 genes in Glioma Impacted by Mutations",
subtitle = "EGFR is in 6th position",
x = NULL,
y = "Total number of mutations"
)
Mutation_count_in_gliomaIt could be interesting to compare the patients having a glioma, to see if some glioma are enriched in EGFR SV, or others in mutations from the top 5 above (TERT TP53 PTEN NF1 ATRX).
Exporting plots
ggsave(here('results','figures', 'SV_count_per_gene.png'), SVs_per_gene)Saving 7 x 5 in image
ggsave(here('results','figures', 'EGFR_count_per_cancer.png'), EGFR_per_cancer)Saving 7 x 5 in image
ggsave(here('results','figures', 'SV_gene_in_glioma.png'), SV_genes_in_glioma)Saving 7 x 5 in image
ggsave(here('results','figures', 'Mutation_count_in_glioma.png'), Mutation_count_in_glioma)Saving 7 x 5 in image
Cox Regression
Library import
library(here)
library(tidyverse)
library(ggplot2)
library(survival)
library(broom)
library(survminer)
library(viridis)Data import
patient_sample <- read_tsv(file = here('data','patient_sample_gene_panel.tsv'))
mutations <- read_tsv(file = here('data','data_mutation_cleaned.tsv'))Data wrangling
Need sex, age, cancer type to be transformed to numerical categories for the statistical analysis.
df <- patient_sample |>
mutate(TUMOR_PURITY = as.numeric(TUMOR_PURITY),
CANCER_TYPE = as.factor(CANCER_TYPE)) |>
drop_na()Warning: There was 1 warning in `mutate()`.
ℹ In argument: `TUMOR_PURITY = as.numeric(TUMOR_PURITY)`.
Caused by warning:
! NAs introduced by coercion
Cox regression
This part is about the Cox regression for hazard analysis and finding some relevant variables about the survival rate of individuals in the cohort.
cancer_types <- df |>
select(CANCER_TYPE) |>
drop_na() |>
distinct()
print("The different cancer types are:")[1] "The different cancer types are:"
cancer_types# A tibble: 11 × 1
CANCER_TYPE
<fct>
1 Breast Cancer
2 Esophagogastric Cancer
3 Bladder Cancer
4 Non-Small Cell Lung Cancer
5 Glioma
6 Head and Neck Cancer
7 Cancer of Unknown Primary
8 Renal Cell Carcinoma
9 Melanoma
10 Colorectal Cancer
11 Skin Cancer, Non-Melanoma
Regression on all cancer types
First, we check if the cancer type has influence over the survival rate.
cox_model_baseline_glob <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = df)
cox_model_cancer_types_glob <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ CANCER_TYPE, data = df) Warning in coxph.fit(X, Y, istrat, offset, init, control, weights = weights, :
Loglik converged before variable 10 ; coefficient may be infinite.
anova(cox_model_baseline_glob,cox_model_cancer_types_glob)Analysis of Deviance Table
Cox model: response is Surv(OS_MONTHS, OS_STATUS)
Model 1: ~ 1
Model 2: ~ CANCER_TYPE
loglik Chisq Df Pr(>|Chi|)
1 -5340.2
2 -5255.5 169.34 10 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We see that the log-likelihood ratio is affected by the cancer type in a significant way as the p-value is of 2.2e-16. We will now check how which cancer affect it specifically.
summary(cox_model_cancer_types_glob)Call:
coxph(formula = Surv(OS_MONTHS, OS_STATUS) ~ CANCER_TYPE, data = df)
n= 1595, number of events= 793
coef exp(coef) se(coef) z
CANCER_TYPEBreast Cancer 6.446e-01 1.905e+00 2.091e-01 3.082
CANCER_TYPECancer of Unknown Primary 4.070e-01 1.502e+00 2.000e-01 2.035
CANCER_TYPEColorectal Cancer -7.477e-02 9.280e-01 1.873e-01 -0.399
CANCER_TYPEEsophagogastric Cancer 3.927e-01 1.481e+00 1.718e-01 2.285
CANCER_TYPEGlioma 4.603e-01 1.585e+00 1.528e-01 3.012
CANCER_TYPEHead and Neck Cancer 4.231e-01 1.527e+00 1.552e-01 2.726
CANCER_TYPEMelanoma -6.704e-01 5.115e-01 1.427e-01 -4.696
CANCER_TYPENon-Small Cell Lung Cancer 3.599e-01 1.433e+00 1.269e-01 2.837
CANCER_TYPERenal Cell Carcinoma -7.313e-01 4.813e-01 1.709e-01 -4.280
CANCER_TYPESkin Cancer, Non-Melanoma -1.100e+01 1.670e-05 7.533e+02 -0.015
Pr(>|z|)
CANCER_TYPEBreast Cancer 0.00205 **
CANCER_TYPECancer of Unknown Primary 0.04181 *
CANCER_TYPEColorectal Cancer 0.68977
CANCER_TYPEEsophagogastric Cancer 0.02229 *
CANCER_TYPEGlioma 0.00259 **
CANCER_TYPEHead and Neck Cancer 0.00641 **
CANCER_TYPEMelanoma 2.65e-06 ***
CANCER_TYPENon-Small Cell Lung Cancer 0.00455 **
CANCER_TYPERenal Cell Carcinoma 1.87e-05 ***
CANCER_TYPESkin Cancer, Non-Melanoma 0.98835
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
CANCER_TYPEBreast Cancer 1.9052581 5.249e-01 1.2646 2.8705
CANCER_TYPECancer of Unknown Primary 1.5023036 6.656e-01 1.0152 2.2231
CANCER_TYPEColorectal Cancer 0.9279563 1.078e+00 0.6428 1.3396
CANCER_TYPEEsophagogastric Cancer 1.4809595 6.752e-01 1.0575 2.0740
CANCER_TYPEGlioma 1.5845056 6.311e-01 1.1744 2.1378
CANCER_TYPEHead and Neck Cancer 1.5266170 6.550e-01 1.1262 2.0694
CANCER_TYPEMelanoma 0.5115208 1.955e+00 0.3867 0.6767
CANCER_TYPENon-Small Cell Lung Cancer 1.4331997 6.977e-01 1.1177 1.8377
CANCER_TYPERenal Cell Carcinoma 0.4812732 2.078e+00 0.3443 0.6727
CANCER_TYPESkin Cancer, Non-Melanoma 0.0000167 5.987e+04 0.0000 Inf
Concordance= 0.622 (se = 0.01 )
Likelihood ratio test= 169.3 on 10 df, p=<2e-16
Wald test = 151.9 on 10 df, p=<2e-16
Score (logrank) test = 164.2 on 10 df, p=<2e-16
Here the column “exp(coeff)” displays how hazardous a cancer type will be compared to the reference one. We can observe that the “bladder cancer” isn’t in the table above, meaning that the cox function chose the “Bladder Cancer” as the reference one, so all the hazards are compared to this one. For instance, an exp(coeff) of 2 means the individual is 2 times more likely to die than the an individual having “Bladder cancer”.
So all the cancer seems to be significant, appart from the “Colorectal cancer” and the “Skin Cancer, Non-Melanoma”, as they have high p-value. We will plot the count of living and deceased people by gender, stratified by cancer type, to see how the distributions change.
df_gender <- df |>
mutate(
Sex = factor(SEX),
Survival_Status = if_else(
OS_STATUS == 0,
"Living", # OS_status = 0
"Deceased" # OS_status = 1
),
Age_group = factor(AGE_GROUP,
levels = c(
"<30",
"31-50",
"50-60",
"61-70",
">71")
)
)
surv_count_per_gender_per_cancer <- df_gender |>
ggplot(
aes(x = Survival_Status,
fill = Sex
)
) +
geom_bar(position = position_dodge(width = 0.8), linewidth = 0.8) +
scale_fill_viridis(option = "viridis",
discrete = TRUE,
begin = 0.5
)+
facet_wrap(~CANCER_TYPE) +
labs(title = "Influence of the gender on the survival states, stratified by cancer type") +
theme_classic()
surv_count_per_gender_per_cancerWe see that there is almost no people with “Skin Cancer, Non-Melanoma” in the cohort, probably explaining why it was not significant at all. Otherwise, we can see that these plots are coherent with the cox regression on cancer types.
Now lets check if the age_group and the gender have an impact on the survival.
cox_model_baseline_glob <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = df_gender)
cox_model_age_gender_glob <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ Age_group + Sex, data = df_gender)
anova(cox_model_baseline_glob,cox_model_age_gender_glob)Analysis of Deviance Table
Cox model: response is Surv(OS_MONTHS, OS_STATUS)
Model 1: ~ 1
Model 2: ~ Age_group + Sex
loglik Chisq Df Pr(>|Chi|)
1 -5340.2
2 -5335.8 8.7183 5 0.1208
summary(cox_model_age_gender_glob)Call:
coxph(formula = Surv(OS_MONTHS, OS_STATUS) ~ Age_group + Sex,
data = df_gender)
n= 1595, number of events= 793
coef exp(coef) se(coef) z Pr(>|z|)
Age_group31-50 -0.05717 0.94443 0.21240 -0.269 0.788
Age_group50-60 -0.29246 0.74642 0.20936 -1.397 0.162
Age_group61-70 -0.18250 0.83319 0.20687 -0.882 0.378
Age_group>71 -0.20525 0.81444 0.20940 -0.980 0.327
SexMale -0.12275 0.88448 0.07308 -1.680 0.093 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
Age_group31-50 0.9444 1.059 0.6228 1.432
Age_group50-60 0.7464 1.340 0.4952 1.125
Age_group61-70 0.8332 1.200 0.5555 1.250
Age_group>71 0.8144 1.228 0.5403 1.228
SexMale 0.8845 1.131 0.7664 1.021
Concordance= 0.537 (se = 0.011 )
Likelihood ratio test= 8.72 on 5 df, p=0.1
Wald test = 8.85 on 5 df, p=0.1
Score (logrank) test = 8.87 on 5 df, p=0.1
We see from the p values that none of the age_group is significant, whereas the sex is close to be significant (p-value of 0.09).
surv_count_gender <- df_gender |>
ggplot(
aes(x = Survival_Status,
fill = Sex
)
) +
geom_bar(position = position_dodge(width = 0.8), linewidth = 0.8) +
scale_fill_viridis(option = "viridis",
discrete = TRUE,
begin = 0.5
)+
facet_wrap(~Age_group) +
labs(title = "Influence of the gender on the survival states, stratified by age group") +
theme_classic()
surv_count_genderIndeed we can see on the above plot that the distribution is similar for all age groups, and that gender seems to impact as the distributions are different for male and female. Lets check in detail this difference.
cox_model_gender_glob <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ Sex, data = df_gender)
summary(cox_model_gender_glob)Call:
coxph(formula = Surv(OS_MONTHS, OS_STATUS) ~ Sex, data = df_gender)
n= 1595, number of events= 793
coef exp(coef) se(coef) z Pr(>|z|)
SexMale -0.13113 0.87710 0.07296 -1.797 0.0723 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
SexMale 0.8771 1.14 0.7602 1.012
Concordance= 0.521 (se = 0.01 )
Likelihood ratio test= 3.2 on 1 df, p=0.07
Wald test = 3.23 on 1 df, p=0.07
Score (logrank) test = 3.24 on 1 df, p=0.07
Male seems to be more likely to survive than females.
df_gender |>
ggplot(
aes(x = OS_MONTHS)
) +
geom_density(aes(fill = Sex), alpha = 0.5) +
scale_fill_viridis(option = "viridis",
discrete = TRUE
) +
labs(title = "Distribution of the survival_months by gender") +
theme_classic()df_gender |>
ggplot(
aes(x = Sex,
y = OS_MONTHS,
fill = OS_STATUS
)
) +
geom_boxplot(position = position_dodge(width = 0.8), linewidth = 0.8, alpha = 0.5) +
scale_fill_viridis(option = "viridis",
discrete = TRUE
)+
facet_wrap(~Age_group) +
labs(title = "Influence of the gender on the survival months, stratified by age group") +
theme_classic() Warning: The following aesthetics were dropped during statistical transformation: fill.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
The following aesthetics were dropped during statistical transformation: fill.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
We can even see that in average, Male have a higher Survival_Months than Female.
newdata <- data.frame(
Age_group = factor("50-60", levels = levels(df_gender$Age_group)),
Sex = factor(c("Female", "Male"), levels = levels(as.factor(df_gender$Sex)))
)
surv_fit_gender_glob <- survfit(cox_model_gender_glob, newdata = newdata)surv_plot_gender <- ggsurvplot(
surv_fit_gender_glob,
data = newdata,
legend.labs = c("Female", "Male"),
conf.int = TRUE,
pval = TRUE,
risk.table = FALSE,
ggtheme = theme_minimal(),
title = "Adjusted Survival Curves by Gender",
xlab = "Time (Months)",
ylab = "Survival Probability",
legend.title = "Gender"
)Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared.
This is a null model.
surv_plot_gender$plotThe predictor is in accordance with the cox regression: a male has a higher probability of survival than a woman at anytime.
Regression on Glioma specifically
Now, we will focus on Glioma.
df_glioma <- df_gender |>
filter(CANCER_TYPE == "Glioma")
cox_model_glioma_baseline <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ 1, data = df_glioma)
cox_model_glioma_tumor_purity <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ TUMOR_PURITY, data = df_glioma)
anova(cox_model_glioma_baseline,cox_model_glioma_tumor_purity)Analysis of Deviance Table
Cox model: response is Surv(OS_MONTHS, OS_STATUS)
Model 1: ~ 1
Model 2: ~ TUMOR_PURITY
loglik Chisq Df Pr(>|Chi|)
1 -329.03
2 -329.03 3e-04 1 0.9865
We see that the log-likelihood ratio are the same, thus the tumor purity doesn’t affect the survival rate for a patient having Glioma.
df_glioma |>
ggplot(
aes(x = TUMOR_PURITY,
y = OS_MONTHS
)
) +
geom_point() +
facet_wrap(~as.factor(OS_STATUS))+
labs(title = "Influence of the tumor purity on the survival months stratified
by the survival state") +
theme_classic()And indeed, we can’t see any trend on this graph, confirming that the tumor purity doesn’t impact the survival rate. What about the Age group and the sex of the patient?
cox_model_glioma_age_gender <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ Age_group + Sex, data = df_glioma)
anova(cox_model_glioma_baseline,cox_model_glioma_age_gender)Analysis of Deviance Table
Cox model: response is Surv(OS_MONTHS, OS_STATUS)
Model 1: ~ 1
Model 2: ~ Age_group + Sex
loglik Chisq Df Pr(>|Chi|)
1 -329.03
2 -324.04 9.9832 5 0.07571 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value dropped a lot here, it’s still not significant but it’s close to significance (p-value of 0.075), let’s check which of age_group and gender leads the trend here.
summary(cox_model_glioma_age_gender)Call:
coxph(formula = Surv(OS_MONTHS, OS_STATUS) ~ Age_group + Sex,
data = df_glioma)
n= 113, number of events= 83
coef exp(coef) se(coef) z Pr(>|z|)
Age_group31-50 -0.1898 0.8272 0.3906 -0.486 0.6271
Age_group50-60 -0.4928 0.6109 0.3890 -1.267 0.2052
Age_group61-70 -0.7333 0.4803 0.4310 -1.702 0.0888 .
Age_group>71 -0.3029 0.7387 0.5393 -0.562 0.5744
SexMale -0.5830 0.5582 0.2363 -2.467 0.0136 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
Age_group31-50 0.8272 1.209 0.3847 1.7787
Age_group50-60 0.6109 1.637 0.2850 1.3095
Age_group61-70 0.4803 2.082 0.2064 1.1178
Age_group>71 0.7387 1.354 0.2567 2.1259
SexMale 0.5582 1.791 0.3513 0.8871
Concordance= 0.626 (se = 0.032 )
Likelihood ratio test= 9.98 on 5 df, p=0.08
Wald test = 10.12 on 5 df, p=0.07
Score (logrank) test = 10.39 on 5 df, p=0.06
Once again the only p-value below 0.05 here is for the gender, thus even though we would think age_group might be relevant, the only significant part is the gender for patient who have a Glioma!
surv_count_gender_glioma <- df_glioma |>
ggplot(
aes(x = OS_STATUS,
fill = Sex
)
) +
geom_bar(position = position_dodge(width = 0.8), linewidth = 0.8) +
scale_fill_viridis(option = "viridis",
discrete = TRUE,
begin = 0.5
)+
facet_wrap(~Age_group) +
labs(title = "Influence of the gender on the survival states, stratified by age group") +
theme_classic()
surv_count_gender_gliomaAs for the study with all the cancer types, the distributions seem indeed to be the same for all age groups. Lets check if the gender influence is the same as before, i.e. that male have higher probability to survive.
cox_model_glioma_gender <- coxph(Surv(OS_MONTHS, OS_STATUS) ~ Sex, data = df_glioma)
summary(cox_model_glioma_gender)Call:
coxph(formula = Surv(OS_MONTHS, OS_STATUS) ~ Sex, data = df_glioma)
n= 113, number of events= 83
coef exp(coef) se(coef) z Pr(>|z|)
SexMale -0.5683 0.5665 0.2319 -2.451 0.0143 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
SexMale 0.5665 1.765 0.3596 0.8924
Concordance= 0.571 (se = 0.029 )
Likelihood ratio test= 5.66 on 1 df, p=0.02
Wald test = 6.01 on 1 df, p=0.01
Score (logrank) test = 6.16 on 1 df, p=0.01
In the case of Glioma, Males seems to be 2 times more likely to survive than Females!
surv_fit_glioma_gender <- survfit(cox_model_glioma_gender, newdata = newdata)surv_plot_glioma_gender <- ggsurvplot(
surv_fit_glioma_gender,
data = newdata,
legend.labs = c("Female", "Male"),
conf.int = TRUE,
pval = TRUE,
risk.table = FALSE,
ggtheme = theme_minimal(),
title = "Adjusted Survival Curves by Gender",
xlab = "Time (Months)",
ylab = "Survival Probability",
legend.title = "Gender"
)Warning in .pvalue(fit, data = data, method = method, pval = pval, pval.coord = pval.coord, : There are no survival curves to be compared.
This is a null model.
surv_plot_glioma_gender$plotAnd indeed we see a bigger gap between the survival probabilities for Male and Female in case of Glioma.
Discussions
Let’s take a step back and talk about the age_group influence. It’s a surprising results that age_group aren’t significant, especially when you take into account that gliomas aren’t curable for child, so this result might be biased by the sample size.
n_sample <- df_glioma |>
nrow()
sprintf("There are %.i samples of Glioma in our dataset.", n_sample)[1] "There are 113 samples of Glioma in our dataset."
df_glioma |>
ggplot(
aes(x= Age_group,
fill = Sex
)
) +
geom_bar() +
scale_fill_viridis(option = "viridis",
discrete = TRUE
)+
labs(title = "Count of the age_group stratified by gender for the Glioma sample") +
theme_classic()So we can see two things: first we have very few samples for the age groups before 30 years old and after 71 years old, second there are more male than female. This two differences combined with the few number of samples are very likely to bias our study.
Exporting the plots
if(!dir.exists(here("results"))) dir.create(here("results"))
if(!dir.exists(here("results",'figures'))) dir.create(here("results", 'figures'))
ggsave(here('results','figures', 'surv_count_per_gender_per_cancer.png'), surv_count_per_gender_per_cancer)Saving 7 x 5 in image
ggsave(here('results','figures', 'surv_count_per_gender.png'), surv_count_gender)Saving 7 x 5 in image
ggsave(here('results','figures', 'surv_count_per_gender_for_glioma.png'), surv_count_gender_glioma)Saving 7 x 5 in image
ggsave(here('results','figures', 'surv_plot_per_gender.png'), surv_plot_gender$plot)Saving 7 x 5 in image
ggsave(here('results','figures', 'surv_plot_per_gender_for_glioma.png'), surv_plot_glioma_gender$plot)Saving 7 x 5 in image
Gene Set Enrichment Analysis
Load Libraries
library(tidyverse)
library(here)
library(fgsea)
library(msigdbr)Load data
Gene Ontology Information
This loads a dataset where genes with their gene symbol and eEnsemblID are mapped to biological processes. With this information we can see if their are certain processes which are more often targeted by mutations than is expected by chance.
BP_df = msigdbr(species = "human", category = "C5", subcategory = "BP")
BP_list = split(BP_df$ensembl_gene, BP_df$gs_name)Cleaned data
patient_sample <- read_tsv(file = here("data","patient_sample_gene_panel.tsv"))
sv <- read_tsv(file = here("data","sv_cleaned.tsv"))
mutations <- read_tsv(file = here("data","data_mutation_cleaned.tsv"))GSEA with mutations per Gene
The Genes are ranked on the number of mutations or structural variances that have been identified in them.
SV
Gene Set Enrichment analysis is first applied on to the mutations which cause structural variances.
Data format: Genes + number of SVs
clinical_clean <- patient_sample |> select(c(SAMPLE_ID, CANCER_TYPE, SAMPLE_TYPE, TMB_NONSYNONYMOUS))
gene_sv_count <- sv |>
left_join(clinical_clean, by = c("Sample_Id" = "SAMPLE_ID")) |>
filter(CANCER_TYPE == "Glioma")|>
pivot_longer(cols = c("Site1_Hugo_Symbol", "Site2_Hugo_Symbol"), names_to = "Position", values_to = "Gene_Name")|>
filter(!is.na(Gene_Name))|>
count(Gene_Name, sort = TRUE)|>
left_join(BP_df |> select(gene_symbol, ensembl_gene), by = c("Gene_Name" = "gene_symbol"))|>
distinct()|>
na.omit()
gene_sv_stats <- gene_sv_count$n
names(gene_sv_stats) = gene_sv_count$ensembl_gene
length(gene_sv_stats)[1] 28
pathway_scoring_gene_sv <- fgseaMultilevel(pathways= BP_list, stats = gene_sv_stats,minSize = 0, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (89.29% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = gene_sv_stats, minSize =
0, : For some of the pathways the P-values were likely overestimated. For such
pathways log2err is set to NA.
pathway_scoring_gene_sv <- arrange(pathway_scoring_gene_sv, pval)
head(pathway_scoring_gene_sv |> select(pathway, pval, padj), n = 15) pathway
<char>
1: GOBP_POSITIVE_REGULATION_OF_INTRACELLULAR_SIGNAL_TRANSDUCTION
2: GOBP_POSITIVE_REGULATION_OF_SIGNALING
3: GOBP_POSITIVE_REGULATION_OF_PHOSPHORUS_METABOLIC_PROCESS
4: GOBP_REGULATION_OF_PHOSPHORUS_METABOLIC_PROCESS
5: GOBP_POSITIVE_REGULATION_OF_CELLULAR_BIOSYNTHETIC_PROCESS
6: GOBP_POSITIVE_REGULATION_OF_NUCLEOBASE_CONTAINING_COMPOUND_METABOLIC_PROCESS
7: GOBP_POSITIVE_REGULATION_OF_PROTEIN_METABOLIC_PROCESS
8: GOBP_POSITIVE_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS
9: GOBP_POSITIVE_REGULATION_OF_RNA_METABOLIC_PROCESS
10: GOBP_REGULATION_OF_INTRACELLULAR_SIGNAL_TRANSDUCTION
11: GOBP_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS
12: GOBP_CALCIUM_MEDIATED_SIGNALING
13: GOBP_POSITIVE_REGULATION_OF_TRANSCRIPTION_BY_RNA_POLYMERASE_II
14: GOBP_SECOND_MESSENGER_MEDIATED_SIGNALING
15: GOBP_AMEBOIDAL_TYPE_CELL_MIGRATION
pval padj
<num> <num>
1: 0.01115413 0.704562
2: 0.01115413 0.704562
3: 0.01248514 0.704562
4: 0.02049273 0.704562
5: 0.02200742 0.704562
6: 0.02200742 0.704562
7: 0.02200742 0.704562
8: 0.02200742 0.704562
9: 0.02200742 0.704562
10: 0.03104316 0.704562
11: 0.04129933 0.704562
12: 0.04821517 0.704562
13: 0.04821517 0.704562
14: 0.04821517 0.704562
15: 0.05392010 0.704562
Looking at the pvalue and adjusted p-value these results are not very robust. This is probably due to the small sample size of only 28 genes.
Mutations
The same analysis, but now with the number of mutations
gene_mutation_count <- mutations |>
left_join(clinical_clean, by = c("Tumor_Sample_Barcode" = "SAMPLE_ID"))|>
filter(CANCER_TYPE == "Glioma")|>
count(Hugo_Symbol, sort = TRUE)|>
left_join(BP_df |> select(gene_symbol, ensembl_gene), by = c("Hugo_Symbol" = "gene_symbol"))|>
distinct()|>
na.omit()
gene_mutation_stats <- gene_mutation_count$n
names(gene_mutation_stats) = gene_mutation_count$ensembl_genepathway_scoring_gene_m <- fgseaMultilevel(pathways= BP_list, stats = gene_mutation_stats,minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (92.59% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
pathway_scoring_gene_m <- arrange(pathway_scoring_gene_m, padj)
head(pathway_scoring_gene_m|> select(pathway, pval, padj), n = 15) pathway pval
<char> <num>
1: GOBP_CIRCULATORY_SYSTEM_DEVELOPMENT 7.790565e-07
2: GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS 1.343345e-06
3: GOBP_REGULATION_OF_CELL_CYCLE_G1_S_PHASE_TRANSITION 6.312870e-06
4: GOBP_REGULATION_OF_MITOTIC_CELL_CYCLE_PHASE_TRANSITION 5.853040e-06
5: GOBP_TUBE_DEVELOPMENT 6.186984e-06
6: GOBP_BLOOD_VESSEL_MORPHOGENESIS 8.692440e-06
7: GOBP_CELL_AGING 1.258465e-05
8: GOBP_CELL_CYCLE_G1_S_PHASE_TRANSITION 1.371291e-05
9: GOBP_REGULATION_OF_APOPTOTIC_SIGNALING_PATHWAY 1.179932e-05
10: GOBP_REGULATION_OF_EXTRINSIC_APOPTOTIC_SIGNALING_PATHWAY 1.108392e-05
11: GOBP_MITOTIC_CELL_CYCLE_PHASE_TRANSITION 1.944686e-05
12: GOBP_REGULATION_OF_CELLULAR_LOCALIZATION 2.252473e-05
13: GOBP_VASCULATURE_DEVELOPMENT 3.792307e-05
14: GOBP_NEGATIVE_REGULATION_OF_CELL_DIFFERENTIATION 4.574272e-05
15: GOBP_RNA_PROCESSING 5.606705e-05
padj
<num>
1: 0.0003318063
2: 0.0003318063
3: 0.0006237116
4: 0.0006237116
5: 0.0006237116
6: 0.0006774178
7: 0.0006774178
8: 0.0006774178
9: 0.0006774178
10: 0.0006774178
11: 0.0008733407
12: 0.0009272680
13: 0.0014410766
14: 0.0016140645
15: 0.0018464747
pathway_scoring_gene_m |>
filter(padj < 0.05) |>
nrow()[1] 137
These results are more reliable than the ones of the structural variances. The most significant pathways also seem logical as they show that pathways are targeted by mutations which influence cell death and also cell cycle processes. This is very common in tumors.
Comparison of results:
overlap <- inner_join(
pathway_scoring_gene_sv |> head(n=100),
pathway_scoring_gene_m |> head(n=100), by = "pathway"
) |>
select(pathway, pval.x, pval.y)
overlap pathway pval.x pval.y
<char> <num> <num>
1: GOBP_POSITIVE_REGULATION_OF_SIGNALING 0.01115413 3.681311e-03
2: GOBP_REGULATION_OF_CELLULAR_LOCALIZATION 0.06849315 2.252473e-05
This compares the 100 most mutated pathways form both analysis. Some interesting Pathways are shown here, for example the behaviour pathways, as it is common that brain cancer patients show changes in behaviour.
FSC with gene mutation per sample
This is the same analysis as before, but here it is counted in how many samples the gene is mutated. This can alter some counts, if a sample contains multiple mutation per gene.
SV
clinical_clean <- patient_sample |> select(c(SAMPLE_ID, CANCER_TYPE, SAMPLE_TYPE, TMB_NONSYNONYMOUS))
sample_sv_count <- sv |>
select(c(Sample_Id, Site1_Hugo_Symbol, Site1_Chromosome, Site2_Hugo_Symbol, Site2_Chromosome, Class, SV_Length))|>
left_join(clinical_clean, by = c("Sample_Id" = "SAMPLE_ID")) |>
filter(CANCER_TYPE == "Glioma")|>
pivot_longer(cols = c("Site1_Hugo_Symbol", "Site2_Hugo_Symbol"), names_to = "Position", values_to = "Gene_Name")|>
filter(!is.na(Gene_Name))|>
select(Sample_Id, Gene_Name)|>
distinct()|>
count(Gene_Name, sort = TRUE)|>
left_join(BP_df |> select(gene_symbol, ensembl_gene), by = c("Gene_Name" = "gene_symbol"))|>
distinct()|>
na.omit()
sample_sv_stats <- sample_sv_count$n
names(sample_sv_stats) = sample_sv_count$ensembl_gene
pathway_scoring_sample_sv <- fgseaMultilevel(pathways= BP_list, stats = sample_sv_stats,minSize = 0, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (89.29% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = sample_sv_stats, : For
some of the pathways the P-values were likely overestimated. For such pathways
log2err is set to NA.
pathway_scoring_sample_sv <- arrange(pathway_scoring_sample_sv, pval)
head(pathway_scoring_sample_sv|> select(pathway, pval, padj), n =15) pathway
<char>
1: GOBP_POSITIVE_REGULATION_OF_INTRACELLULAR_SIGNAL_TRANSDUCTION
2: GOBP_POSITIVE_REGULATION_OF_SIGNALING
3: GOBP_POSITIVE_REGULATION_OF_PHOSPHORUS_METABOLIC_PROCESS
4: GOBP_POSITIVE_REGULATION_OF_PROTEIN_METABOLIC_PROCESS
5: GOBP_POSITIVE_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS
6: GOBP_REGULATION_OF_INTRACELLULAR_SIGNAL_TRANSDUCTION
7: GOBP_REGULATION_OF_PHOSPHORUS_METABOLIC_PROCESS
8: GOBP_REGULATION_OF_PROTEIN_MODIFICATION_PROCESS
9: GOBP_CALCIUM_MEDIATED_SIGNALING
10: GOBP_POSITIVE_REGULATION_OF_TRANSCRIPTION_BY_RNA_POLYMERASE_II
11: GOBP_SECOND_MESSENGER_MEDIATED_SIGNALING
12: GOBP_NERVOUS_SYSTEM_PROCESS
13: GOBP_POSITIVE_REGULATION_OF_CELLULAR_BIOSYNTHETIC_PROCESS
14: GOBP_POSITIVE_REGULATION_OF_NUCLEOBASE_CONTAINING_COMPOUND_METABOLIC_PROCESS
15: GOBP_POSITIVE_REGULATION_OF_RNA_METABOLIC_PROCESS
pval padj
<num> <num>
1: 0.02246319 0.7349081
2: 0.02246319 0.7349081
3: 0.02307768 0.7349081
4: 0.02402708 0.7349081
5: 0.02402708 0.7349081
6: 0.04409673 0.7349081
7: 0.04409673 0.7349081
8: 0.04797215 0.7349081
9: 0.05298647 0.7349081
10: 0.05298647 0.7349081
11: 0.05298647 0.7349081
12: 0.05689900 0.7349081
13: 0.05696203 0.7349081
14: 0.05696203 0.7349081
15: 0.05696203 0.7349081
Mutations
mutation_gene_sample <- mutations |>
left_join(clinical_clean |> select(SAMPLE_ID, CANCER_TYPE), by = c("Tumor_Sample_Barcode" = "SAMPLE_ID")) |>
filter(CANCER_TYPE == "Glioma")|>
select(Tumor_Sample_Barcode, Hugo_Symbol) |>
distinct() |>
count(Hugo_Symbol, sort = TRUE) |>
left_join(BP_df |> select(gene_symbol, ensembl_gene), by = c("Hugo_Symbol" = "gene_symbol"))|>
distinct()|>
na.omit()
gene_mutation_stats <- mutation_gene_sample$n
names(gene_mutation_stats) = mutation_gene_sample$ensembl_gene
pathway_scoring_sample_m <- fgseaMultilevel(pathways= BP_list, stats = gene_mutation_stats,minSize = 15, maxSize = 500, scoreType = "std")Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (93.7% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
pathway_scoring_sample_m <- arrange(pathway_scoring_sample_m, padj)
head(pathway_scoring_sample_m |> select(pathway, pval, padj), n=15) pathway pval
<char> <num>
1: GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS 2.438281e-07
2: GOBP_CIRCULATORY_SYSTEM_DEVELOPMENT 1.343345e-06
3: GOBP_REGULATION_OF_APOPTOTIC_SIGNALING_PATHWAY 2.273277e-06
4: GOBP_REGULATION_OF_CELLULAR_LOCALIZATION 1.889608e-06
5: GOBP_REGULATION_OF_MITOTIC_CELL_CYCLE_PHASE_TRANSITION 1.822102e-06
6: GOBP_MITOTIC_CELL_CYCLE_PHASE_TRANSITION 4.759078e-06
7: GOBP_TUBE_DEVELOPMENT 5.495708e-06
8: GOBP_APOPTOTIC_SIGNALING_PATHWAY 8.544820e-06
9: GOBP_BLOOD_VESSEL_MORPHOGENESIS 8.422711e-06
10: GOBP_CELL_CYCLE_G1_S_PHASE_TRANSITION 9.323332e-06
11: GOBP_REGULATION_OF_CELL_CYCLE_G1_S_PHASE_TRANSITION 6.828689e-06
12: GOBP_CELL_AGING 2.176076e-05
13: GOBP_REGULATION_OF_EXTRINSIC_APOPTOTIC_SIGNALING_PATHWAY 2.078563e-05
14: GOBP_CELLULAR_RESPONSE_TO_OXYGEN_LEVELS 2.391474e-05
15: GOBP_REGULATION_OF_CELLULAR_RESPONSE_TO_STRESS 3.514915e-05
padj
<num>
1: 0.0001204511
2: 0.0002245997
3: 0.0002245997
4: 0.0002245997
5: 0.0002245997
6: 0.0003878400
7: 0.0003878400
8: 0.0004187024
9: 0.0004187024
10: 0.0004187024
11: 0.0004187024
12: 0.0008269088
13: 0.0008269088
14: 0.0008438487
15: 0.0011575788
pathway_scoring_sample_m |>
filter(padj < 0.05) |>
nrow()[1] 109
The comparison of the both counting versions, result in very similar outputs.
Comparisons of counting Methods
overlap <- inner_join(
pathway_scoring_gene_m |> filter(padj <0.05),
pathway_scoring_sample_m |> filter(padj < 0.05), by = "pathway"
) |>
select(pathway, pval.x, pval.y)
overlap pathway pval.x
<char> <num>
1: GOBP_CIRCULATORY_SYSTEM_DEVELOPMENT 7.790565e-07
2: GOBP_NEGATIVE_REGULATION_OF_DEVELOPMENTAL_PROCESS 1.343345e-06
3: GOBP_REGULATION_OF_CELL_CYCLE_G1_S_PHASE_TRANSITION 6.312870e-06
4: GOBP_REGULATION_OF_MITOTIC_CELL_CYCLE_PHASE_TRANSITION 5.853040e-06
5: GOBP_TUBE_DEVELOPMENT 6.186984e-06
---
99: GOBP_PHOSPHOLIPID_METABOLIC_PROCESS 1.066403e-02
100: GOBP_RESPONSE_TO_METAL_ION 1.102778e-02
101: GOBP_RESPONSE_TO_WOUNDING 1.129537e-02
102: GOBP_PROTEIN_DNA_COMPLEX_SUBUNIT_ORGANIZATION 1.232692e-02
103: GOBP_RESPONSE_TO_INORGANIC_SUBSTANCE 1.371649e-02
pval.y
<num>
1: 1.343345e-06
2: 2.438281e-07
3: 6.828689e-06
4: 1.822102e-06
5: 5.495708e-06
---
99: 8.731094e-03
100: 8.114109e-03
101: 1.041858e-02
102: 5.628665e-03
103: 8.831568e-03
Move HTML files to Results
if(!dir.exists(here("results"))) dir.create(here("results"))
html_files <- list.files(here("R"), pattern = "\\.html$", full.names = TRUE)
file.rename(from = html_files, to = file.path(here("results"), basename(html_files)))